diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2c23a711ce..b3dc19bf97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17f34a6d5d51c8e1f9ff6e7214de27fde634db43225c384c2587fc9462f1748c +oid sha256:5dbeef553388e130f1b41304228f629807dabac3c5e0c66a0c28ea9833340040 size 630932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 17f42ca11c..cc6edba634 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf58b9c01ac2b2ead1fff8b6f77f831df63c6c4e7f7e53dd6bca2d491aba5f62 +oid sha256:a90e201852a6f3c911e982a9f819a6be765814c39a24b0c9f53ce4dcb4d28c33 size 565951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 026f8f6326..13b2b7f6a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ff262f0ca67a42acdad47466eef7927fc160e86bf419eb1c2dad23eaa124158 +oid sha256:7b3100a44dbe296bc51f7ff18efd9dd1b03649bd993a1cc54da0585208395ea0 size 483501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1779d86ec9..e968a81b6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5606453cb0b2ad5cc87fe714b9f3db12149a074cd942e3e3fcf81e39b1b8e435 +oid sha256:8ee2df4ea7203351cd5d12be09234ea39cb0a0a3d1f6a7d8f87790f5f02991cf size 452197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 45986dd875..6c8cebee02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01b753b3b47322d84e41f3da336fc412e506a07896f404b698995e1f26a3ceb7 +oid sha256:70bef448c376a3e57294f21afc296ea1e7a1f21bca158cecb995de47fcf7a9e8 size 471975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8e9854b8a4..1904b16b53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09964a2485eb822cd27cb559014c80200cf3dd779a10f94a547f76e6ca795408 +oid sha256:abc46c06eba3f60de4c29a4d06b672c0fd406f0b94e82c4828919c3ad173e094 size 447801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8aea16ca13..290496288f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2b406054d3d62e07d36ce3c9c81ea7d91a7b5a8a605ceecbe4ff0c7fa4ea825 +oid sha256:707f1643ecd4243edbc2552b1c14e13154f659f9653122386c7d71fd97762626 size 626982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index e6080eb804..00769445b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44398e40e44aa59b773ddc47980d87e4e7db7f2f8bb79fa1cd47c94ad1c5a7a1 +oid sha256:949736949a7d8db4eaff5171d6a280fc75e7597e78d5cf53cd698bf58143599c size 566023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fdae44ee22..77c78a342c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:359a7f91fabf1fabe222aa4204b35726a2a0bbf4f78d3fe1233e9dd1dba6d37a +oid sha256:620338e0232c4bc83022026e927ad4337b3438bfe5225a771d71ae821708a67d size 452573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3fe5963284..e1105a4183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ab77904c1ac7c61e8244fdd8693f0284152f7ef256a467d41576cd0ca4f9b70 +oid sha256:501bf6e66db0c3a774fc7505841ccf5b02da306e43522b0a694d61953cd56864 size 394599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d23f9c1d10..4fddf6fa16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a977331738ef6f94c056fb0415067677577a3ccc0b4f5b35ba9f1af81ecb1da2 +oid sha256:f0f40f5a666044cca61e289b9c48c98977691b9e1d32a72179128ccbee15cc55 size 436291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index becdb7340c..131a7372e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8753486471497fcd14296cbe79ed0217fd96c736e8bcaae2ba7f64e86b56f17 +oid sha256:c59673a8be4362f46f2e9257ec7c4e38a4800c45b1945f9df0cb32ed0c6d43ae size 378317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 91ac5e2f82..0179359809 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e86ee3fcc48ddae27a34f84cf83a7c61a634a27277fb96b33e75cd5c78f6a52 +oid sha256:34c1fd4fa79d1384876b968ffbf432fefdf981338a7956e296799295d1d0247c size 505637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d42ac64fa..67d3cc1421 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:356c67ce617da2547bc3969fa893638bb1b824684db7eeada2b64713c69e0dfe +oid sha256:055d697387425f909c5329b792c5db97a58cc847cc7755169de50f99d0ed5f73 size 468807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6789cc7b4d..4911a5924c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddaf641d7ed2bf7bc20f5a84bfb4b801297d83d00baa320f1b38948bc575aeb7 +oid sha256:60c0b37d90c1fc5e3cb27e480d84d3b18d47cf7b1cd4e3a3af35f941100e54aa size 494111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4338c07ed4..e1f14eded7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d35dc6e393dbb3e06a8ded38d3beff7977d4b768192ece6fec69939df0d0454d +oid sha256:a3b9f3aee4f1a1d79c4876a4fec9edb0baa4c78efec3c76c04ca8bd00af76963 size 469935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0d1994b518..15cc8270a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3404acdcd673f67dbd15f247cfdea00c87f19370defaba5884b5b9174fa596fa +oid sha256:2f417704dbb04df29ad9019c106a1de5aa4f4415e7b7aa206600843b30645000 size 670528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 45b61e0a50..f2f547d237 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d7fd84cc238fb1002ac9fbe5ffecc1dda3fa40ac028f68bbca44b5a74017aeb +oid sha256:b73aa21041034ab532412a51cf00dda4d8ffbb5d3970628dc5fc9fc0dbd39e24 size 607571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2e54ad685e..158609a3e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bad8752b471b77f4a8343f9fdb4127956f55ff1aee21b2c0d606daebb1083880 +oid sha256:dfac7eaa6285ea9b8e02a1ac7318a14a89bd04dfd9794d194c55e45583aaeb1d size 481715 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4b40e7ec6c..6ae979c279 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70255b5c11bfb4e06de22f9106aecb070d0a336568b121ac3f9bdc59ac8ad9fa +oid sha256:b18b4478ee849f9f92ff59f68887710b0c9af43d57819821e64ab81b7c8fb1cf size 411875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 51ec1bb4ee..ddb87d88e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a20420229225ae1c2c0e853a0a585d04793e435d63cd6090eb7e38d9cbbb18e2 +oid sha256:b39f08d8599260d351b6df4e142512bb75dbaa219a4a701a26863c0566d3e72a size 460697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba23a80cb1..414dbd42b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b43826a9a48541cba45ff0082283e8a069cbbc9894dae055a2402cbccef7777 +oid sha256:59497957d8e27c087941c6bc5db6dbeabbd0e02f98b3d359d0ba347698927f8f size 394015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 54ea047c66..b77a11808b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fd569e36929c1a54ec573f5e1f53d65ce3630c5689920fad2864422f7dea7e6 +oid sha256:55485e3210691cae102219ccc7c6dc4428a79a3115887447e8d584ac25103cf5 size 663022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 10b4a0488b..86d2475f54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af6d5c8888d51e54ffcdb05f4c42016673c684c878f901be8025edc6e490b2aa +oid sha256:7fd46e418a1d9ca3563211e581bf352207e88bef95293f71a15a63f516e008e4 size 574435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 70274c6d62..eb169b5463 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bd694d0ba8cc061c6932b070779d57815336f59075d189f57ff010103a621c7 +oid sha256:8e9a80632812c1d98e05131359d8d0282a0219766f4f949257804e0da566c479 size 593219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 06b985e9cd..cea57988a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d41cb296ae0343e7f017be89295d49233757f5cac6edf07c3cc96cd6bb907bf4 +oid sha256:587b63d2d1e7fb17f0ff8ab7483e4fcba28e4944bcee169dd5d5ca2909b567c3 size 569019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fc9bac9390..f8f218e683 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a88239db53ab73c5589f06469a4afba6d20c1f23c8033e740a628d970697d93c +oid sha256:7584eb0eb94587d360419bf58a4df610a61b99069b044f82c9bfa8669cc1ad2f size 580115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 121fdb649d..ec0489c79c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5049e27f0756a25106f5615d60e539bc514a798a722146380622b8d2ca32694e +oid sha256:a578404ca82c1fdf9be383d49d0f1e7664e935088fb33d2bc1bdb4baba166d41 size 559885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dd6bb818bf..75a42bf3d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ab33d785fb970903081deca276f6904aa4a58101b17839eefb96f6f324af263 +oid sha256:01801e6a36b2ad80cc9bf4ff90586b853694271cef751f55d2c5ba5eeea401d8 size 663760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index ed4b8c2646..b896cc98f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aad7ae4f3f4dd336159cbd10009f072359d428463d552cd478d30fdbc7e936b5 +oid sha256:d444b0c26d99f3dc92f58a172064f3f7c21252164e9cb127f6b5ea3cfd8f9af1 size 608499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7f599daf44..74481fc1e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57c8aff9083d68a7979c626e6ba095eb8e48b61fddceddf3721a65f71877be34 +oid sha256:a2de5904a5271e374185323f2029b11fe07edbdf22666a562cec4e9868915e57 size 558345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index deae2bf21f..33b71349b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a566742e538f476775da8d6bb0566f21bd9b8c0491b427d4818a450faf2d8148 +oid sha256:c4bc2d2f470b216d493ecd2a9d559fef26d27d34a5c9d5a44e5b890beedbc6eb size 501159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f7df84b8ea..bd6262eb1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50402fdd3ed68147dfe0b021c6c8b2515a00332795734f9451ce39edd666573e +oid sha256:6bdec3801521171bf2fc0442ccda65a5d66e36be28ec620763974a7fd8d51c1a size 539695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbbcb79966..9ba2e2b408 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:500a9236fc5e897b2c406d755def2dbc67cabb00f097e4b8c34419511cab2f99 +oid sha256:e29787b3c4dbe5adb56c163d3e8b89191afd58615add63acdedc8a760ca3814c size 483299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8199bf63db..194042c056 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2a8fade86e5093027c001b27a612a5e7ee18ae143e08ccd00043f4bc32ccdca +oid sha256:f0584f5e2953b9827bb31c1533f9e8a36e2998bf952f30c87630c2db48362b6a size 616933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 51c1285291..8fd9ffd63c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28bd8634521092266655956342505e5f978559a41100e22c5fd0c801855bf166 +oid sha256:ce98ce92ba5132756ce4b5bd3ec350e47b80ef06e573baffdfae212ba0236d57 size 591153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d2fa42edc3..907b7e3dca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5b51830511bdb82a252cd25a06167e7f97bc160e80e8104ce556ae37229ff67 +oid sha256:be20a25127519e3e4816c0722c1868f9afd8556118d0ede9668a7313b781c0be size 603039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 084ffd3bdf..25249efd70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9840bbba2d04a1c72e61e7c9bff213eea3a1068ec19998e34cd5916e0b9e0f3a +oid sha256:a00e5f26406e9cfcf828c84c9cb054f096bf96d238989ddffef1da097243bb49 size 582021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 56add4e2af..5d2b6b37a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19f636770a8f6b8c2112c8d6c1eb8d991207d7ee9f67b4c8c18a7f0028602b6f +oid sha256:bf56cc695ada25ad4b2707dadf77d0c98c3087e2f52ee6c89b2a9d59d438cc8b size 702670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 650ded98a0..6f579f6557 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df92ad291cf78872f979709e07e4a85b1647a34bc76dd53ca31d71ce08e62b37 +oid sha256:350653f2f4134348c19bdc0460dcff2f693023b783023a4ed6a1eacae1715dbc size 617191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1500c7ff54..c96a2b12dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7b6b8e819ec825f331ed01b7d9f7d7fe115050ca29fe6c3d31e87146250d56a +oid sha256:90dd023f70455b0788425890b8cd7b869b21f732e2073d6085a60fea86f8fe10 size 586695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cb1cba2b10..a8a3930e9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bca2490575bb5d45ac37a176b7da45fd91445e0ed058bb8aea0234941289cb2 +oid sha256:b77f111c0a72a1c478c4850102b729dba22c7d28ce72b950af96e6ca838e36b9 size 512909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b83889478e..92f947eb1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:527acfcd515e35e2a383cddf972f2b7602a381327633ba20d6732319cb6bce98 +oid sha256:cbc91114ea6283743c584a2c908a431d0339d1f8f3c1b62d484e2147a6c91688 size 564889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 64b394fa6d..764ae7da7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:913334b768bf0af735fec8189dd0806f6b22f40a0cd30e10a538ce566237e7e1 +oid sha256:61472723f538d24c9acd2715d33bdec5b0b398c8721f760b2d1487a18f9c56db size 494259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1695cef5ee..9275c67885 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e22d2b124c9d85ed319e17a5f36c6ffb46e47f4c520fe9c31c9fb46fd6a118b +oid sha256:4d573460b47edb6494ffaee16a7ad168fc838bc1f682276dcf883f3bf87dacd8 size 561491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 289a9d6915..1cccd823b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ca4fe139a26e0bde389ae0dfa2a3a76ffe79367562bf7ba454e5936808d6107 +oid sha256:d04d9dc89e714b2fffd0e2c3584a5acfae5d8f09b372d73331bc6bbf8ff5e31e size 502949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4927888369..d716d06c68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dfb56733642035a86b7ddf56294109ee185f122e2a69a493797839734ddeb4f +oid sha256:357f912b4b3932fbd1715ff5d3630cf0f57ad1ccc8345319f66f59dfe64a244a size 456883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 214215958d..d512b1c652 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a776f2051102716638f00e252f1c20829719c386e9cdef9a418368996454320f +oid sha256:e50308861808e757f76cb664d4834089006cb5897ca20ca7c62047681b38d58d size 442723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 441d5ac182..2dca924eea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:356f207596ef727c1b12c4a01950869ebd066b583b6b2a7b9423c9804ee317dc +oid sha256:07088afdd4892f3359c43cd71a036eacf99aaa7a61ff5be89cd1a14380d69f57 size 450859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2fbdc52c5f..2eb05a48f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f2035c671f6a86d4622a2061df21684f05c74d23e0c6c2d92f869b362dfe6db +oid sha256:cd3ae620c3742f516c4cd35b72773bc305e98b98b1cc4e63d44d5083d30f123f size 436747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ede97e7fd3..e9b037bea7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a3c22687c7327734fd6c0013e2b2cd19003c29f72b89a5b5c53835950ae91b +oid sha256:10dfcdeb3f26e142320f4deaa7d3100d3b411fc45696bb21232a102b0694efc1 size 558333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index c97e64bd8b..e86e2bafad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f93ba7d2bcff0f78486277efe6f1c13fe2387aa70605b916cdd22fb15ce1c31 +oid sha256:bec6c7e277fa9f46c6e6650bab9c17611dc4cd1669b4d75211d233db672576b9 size 502233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 23ca718e1e..b07bf09928 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61d7b6e0b56fbe8d6b9cca3cb934f710391654765e76f6f74485baf525f50c8f +oid sha256:0b6ed54b526c428b08dbe9f7e80e7673fb8a2d985ce065d9ace6730c44dbcd28 size 441719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7e4b1eb907..2dcb8c2b4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:434de3e33d46200c6aab0f120c05b16b2b29bed1f6f7d06a309c0b3cb2c9fdc9 +oid sha256:fee499381c761edd8ed3c707e9e86af26b7d8112b7285d8562364445b4db15e3 size 384533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7442692794..7a724f3e6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09d25663c25667d1ebb2a504ba6c261c87d77fa512d15232bc80cfbc65f5eb7f +oid sha256:527451a03c38f8a12a2fd09c911c4ea54cc89bd7f688108ea30694f661bbb5f1 size 424449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5ca2524dcb..810656c2d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17fa99b7bc15154573aaee6e6c0aaa918c6c959daec291cdade13e5152479921 +oid sha256:1e3cb444f826882d1f9fdde1d49bae9235f36fa683a9ab2b7879f82a4207d689 size 367265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 62c6f4cbae..376b5d33e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d42e7ee8b11a2c06cdcf012826ee0020ba83494713d22225d358d2d90f9eccd +oid sha256:32f4a45f95b8ae84a0e3ae5a43b3b5f5c7362b1ce223819f516f4b4e73170bb7 size 473445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7005ab7209..f5fe8694b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4e5bd42ce6da3b5d4fba10b39ea5f6d755a103b9e1b6dfb1c43e942aded5850 +oid sha256:3d8a3cb32086a1867cce707e2300b7dae6b3f47db799a77848534a205e6846c1 size 458543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5d6fd3ce11..0c472509cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63ee26044f6e3cdf5558f438c2afcd6a17a3a0208ff4635111831a8108d85075 +oid sha256:aa25c1b87150f6db0800cebb0d799ec948d40e80384c5312eedb52f73a1d6618 size 473783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4880a33e72..d9dc3e80b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ddf4863648d25cb764f8dc0a0c1f7fb750d128fb6f49100bf96668d2ca75079 +oid sha256:89c8a875579b5d4ff2f53c6925a71f5f382ec4e0763dd56ec92423aab9166a94 size 458883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1c851684c0..747d330547 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4955e59ea224d35862cdea6a1340bcdc1d7e26cdd0e9063b69dfc0bfac58e43b +oid sha256:a654c16808084190287ebf34535e68dc1f4d3d1f3c082912633e372f35df0648 size 601089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index cbcc4fd6b8..e89cf9de1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85567463b222f06d5dc0de196753ac3e2675851a63010b7b0db0ac6a7b003b9a +oid sha256:1fdf46c99fb0c98db2c447a5e22075c9f3e037665a9945e61763adf2b3f5824f size 545605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 49d10725f0..291316d67a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26abc8741b5261ec9a388a4f13d50a1fb42fbc1891b50cd226c66ad9a0cdfe3b +oid sha256:04b2dd3f1d6ce30d084c944caf3301c274424a004097efdff1a345f72083dff8 size 470069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4290a65f6f..d722dac273 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffeb03d13ffd0586579097348aa7622db56b1af2ab427e39feded5a85412bae2 +oid sha256:6fb4cc06f718df560c8de15140cabef4e98bd1d0f35e186cf37d5e94a8256863 size 400231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 48bf2c4b2e..2c907f9434 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bc137f117ddf4772b21786a2e244ccca365380a96b941c99168828126cd51ea +oid sha256:5b1d5f5ed80ed7d8d0c0dcf465f31ada29aeedaa85e4da693a684a5cf940d927 size 449643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fec1d1b3dc..3afcbf5251 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abb74fdb16d0939e6f8e2ef6c9e210e2605c4606a11b356ab111bc8cea92b3b2 +oid sha256:06c57c87ed62e80008fb18fe5688f1f633c45aa0cc1a66f826c1efac62af8bdf size 382961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 84663f19d5..8f55d3dd36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab4b2cbfb75c57ab5e13b6f7b83ba8bb242a7e467e72b39aaf434c4bc7c6aead +oid sha256:e2568162532c48e842319952122b3835e4ef014861cff4f4e37491d3703ef58a size 749338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c9ead486df..551e3e18cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4f5ea892f241b754a11f09517c7ba065046e9b65c35ebcb0bedd84c745eba9c +oid sha256:75be8ecca1ec7acabb23fe44d3cd08fa50d46a2600a921c8e2197ff731141607 size 776790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 2124398051..7841edce69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:848644b2465232f39be0f82ee864ceb30e872c5a951bfcb80ee52bdd7d51bc13 +oid sha256:883f192f66396084fb325abbb832687f2f27887c0018a970e39bcae575b4aa95 size 753232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2772040f2a..f176b5baf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a079ac7c297f2bcb5b6e2008140e2669ee3b50d2d49f3ee8b78cef9f9e80c33d +oid sha256:dec9396cbb1bdd3ca6dfd831b9c957ad2fa31e0ddf2a6455b91b6aeab6c7d06b size 773336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8e00fb2b23..8e4d77280c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed32d5e1a399b19c5b5a951b51efdd725591ab5910e434e47b9219ed9e8de053 +oid sha256:98265cb4cc56fe7336995f39759b8a4dbd6d35b68f9c934d746ec9304c16e488 size 818090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 919d1ec7bd..b4c838d518 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:269abd26b450b88d2fa3ebef00b8b50c9bf7acad9f49d8d7aa9dede9cfcc5297 +oid sha256:7919d7baf0bd29baa550b4c9331cb5b74c9184211648a2ae4309956fe4b707e1 size 844952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 85316cc4c5..b5f25509af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f9bc3320af971ab6c75b95a8f0757f39996222f5ef84735cc7ed5ca2f87e7e9 +oid sha256:a44bc023a76941f7445520e260d0c0fc8760063e0d8955b19a83574bae6075e6 size 848746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 872b65ddec..dd844a4057 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2efa0d321c999e39ba1aa4ea2ce76b9298cda87df4c5e9ff5e14cba99cae803 +oid sha256:fafce52791f2b4abe59c1bcc716eaf0b3098d5f713a527ffb33e39675e976f96 size 825966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f715975359..bafee37e73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9aba7ec5f3cc7c07a1727ba7a453b8f8f502e1ace7cb8342f95ba155c966642 +oid sha256:16a8725bcaf79441df397b8d8b68150c9eff705b3f1672c19cf6f3054faef8a8 size 719058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 5d5f98152d..ab882126f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57f9664d0518bd05aa601b8500f3f1043bcfe15d6c6da83393ca8147046dcec5 +oid sha256:22ff4060835564fc0a59b887d6c7fc7f09acb22e280d39304cd6ce2226374002 size 805738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0a102f8d67..64138e2f53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23c707523597fdece25c21a724397f58fe46b433cca3b2c2e861f19f53370332 +oid sha256:4ff69794880a0f1144844bc8d521c524e2ae134655371fbd3db37900b3d5c13f size 720982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d700fc8850..84c25c408b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eca4a5d71eafe647fc4a9f233b07279bedee9e91fbbf59c2492728a23a4d3a77 +oid sha256:94b49a1cbd0517790d651a87cf311e55a39f85379dee8753ffae750369dbbd83 size 658228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b4e906457e..a1891367f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71ee2dd17a4674017c8027c51a222fb1e86336f8a8a167fba7986bb2795e2b24 +oid sha256:e866754a3c408508951f101c0e4b41ea5a990902455bb83a435f1603422b038f size 873782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 95cd814833..b2f316fafe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64c39c397708c9a68ed4d7adcc3474abe4eb665337c380c9bd4e7bcbd01846ef +oid sha256:6785f8bec42a6a4d110f66c16511c87526f488137bfe5b1f802a706ec5fc00ab size 811068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 5db059f718..0b9d15be29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c99e03823e2b145ff0d1af2e350c43bffe57f98f8daf544813db4ca5191dbf01 +oid sha256:ac719101e6277459fe356b5fbc639648cf54e60938991edce9f40ff966a68379 size 853024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 83c6d7edb7..3969f5b9bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bbe1a71f2776f6686e7f901a117bf4ddaff061f0797c6e3eaded43041fddb60 +oid sha256:1784583d9f22ca737eacd69e82bfdcc91c86aa16382937aa736eca54d6ee5803 size 598579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index fad42b14d9..b6533aa0f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98314cc5d14750ab1e9831bedb3133488ad61d89d1296ed1babae090e65c40de +oid sha256:725eca0a4c0be1060a784a222ea689be1c6f4c0322a78a2cc294dc4d2dffa994 size 558765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 381d67f33d..838153ffb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:710d1ae3553712b020e69d642e72ec216938f0aaaf1bd7d638ffa6ec2e1d5f42 +oid sha256:5809af60b8c4508d4d2c42949f453e45c7974fa3121a78a463b1d33d38fcd99b size 585301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index afc03518fa..7bba3466f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4e98665d17882d7bd5af2fad97016be5b5a8b130885bf506020e98a6f76d7c8 +oid sha256:58932c38a8501a22357c2ec29bf13fdd04c3d2f7bdb899514dc98c4d43e823d2 size 550075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5b30b3ea88..592e87f30d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9efc3e52d0e0f4cdbfa3aa854a5f788160d00e7df238b0952b465824faf27b88 +oid sha256:ae1455f120f347e608b644c583c1b35fd215aaba738c0cd0deb517d98df7aa1f size 846870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6466914c29..03433ca380 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10bfa182894967dfa34e92e8b1d6bc96f9fd7fb94a299d6ab97b74194947f2e4 +oid sha256:4fa0236db82315f7dbeac5dd0033a5b91e98c78fed338ffecfbee527ae46cad6 size 871954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9ae57d354e..54b07be0f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60cee192a45eef5217c3c9773295cdfe263c0b486770b15bc96c44a0af142659 +oid sha256:d92ee16e232949ae212facd1456065987aeadca6dbba6bb4220e7595dae7493a size 603887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index f907b2c1ae..b562a07481 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f6e2fc8fac9ff38bd3497c127165afa2f4c11b08402d027f4707216ea76d03 +oid sha256:9b2f7bf4cfb5028b0cdc15d765936b09876438012ccb6034eee805f17680cdab size 503435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3d28f1ae05..71bd6e0aa9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9dbb08135c71cb820bcd83f9da1cbf4fd128866df5c863f053d7d34fe92664d6 +oid sha256:ff15ba338aebcf63f10b718e98df516f6f7b9f80c782ec94143719bc5b22c0fd size 572311 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2243076cec..bd4a244956 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:366032150babf06baf3dab952bd167ce95699d407358411dc7b27822bc9a48d7 +oid sha256:1982c8f4f67fc089820e1e7b8467ae43904bf1f9d0fbfcfc5fa568caf9d82de4 size 474869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2f55dd5eee..eaa65a09ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d5c42d3d9a269dc997a07c00946039cb63d2c324331781ecbd39a4d2e1ac5a3 +oid sha256:febdee9c104e6c7a211bdb1982fb8c112b16ec199fef8c2dc5ac00639e3a7837 size 624070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 08caf27b25..9856b1840b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1851d1b6eac7960663dc770a23aacc4f0c96582a132e3494fce1996bb2c9257b +oid sha256:b24f4356e5591d52f881769567532a364cb3db894a33c8ddafef5efd9bc2fcc1 size 584303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5e596552ac..1627582210 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f168573808c795cc538fb002b1972231cdfffceecfde0c9aefa2084f0ba07140 +oid sha256:2f07790c387f47a73295f842d1a79df69e31fff4c662dddc915dc216ad3e318f size 610003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 22dc6b2b6c..85092b3901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff95c45add3055711a202c6286ef36cca198482ea1d880816ef573fd4643cfa1 +oid sha256:49d01578e23ffe5d1331edbe0461da10a077834ee7fa4a1f2a3efdfb4bf27b8b size 575517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 146ca796a9..e71d375221 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6e15fa81829268ea2d24030b2ff24629942022c36e3b9911a97425f23cf7b04 +oid sha256:f5f694f7f7c4d698aab72197cb9fcacb1b448fe0e0f7396c426da6e59a4ebfa2 size 920706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c0f79446fd..bd41d07f46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c44bdc331bef4e477d51d27f94bbc90261837a018a295c003734a45d4c76b4a4 +oid sha256:63716d5550c2731bdd335a1a7cbcababf7c7acaf9c3b6fa4635ecde24eb9fb51 size 875874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f30c05d8ab..0e0db885f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2387f384ea45838dd41c7f3d7164652c429f6a661f7ba617245ca871c804afd3 +oid sha256:dc71b743c302521684054abfe5c0551f95659d335f3ee6cdb585185c7761529f size 747556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 14c431ee70..29287382dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce2fe1a2272ecea40d88fe8002321664b872b9979000ada604abbe61cd9034f9 +oid sha256:19d6c899dfd455b0cb8a2981841f9426e2e4f74785a0be3a11b42fb55de0a505 size 839070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 79b6c151c4..4859f1a96f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:382b15098532c0a9936a3dbb0cedf51a191b10b0f57003d37408e64c83b7a4db +oid sha256:8279950fe32e24f8aedb09df2141e1f4a97f80795e39e7ce8058ac318e6be2eb size 757818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 60546a6f24..0d9de4ac4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81b51dadf53868428c4cca413554e414f81db1e13b64447e3b85655c6981ac3a +oid sha256:cb029029a3f0c80a5a2a3a53ca4c6ce40436e774cc79f0aca7712e15bc3f1dd1 size 694176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9018a81bf5..60fbb44b07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df15c8bbf37de3e9afb8aa008686029cd79758fbce5d747d3f051cd3a9d32f66 +oid sha256:2351e351d1015470f6a932fcba64dfb1415917cf9e4637780b23c9c69703e754 size 945988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a1e89f3837..5e50bcc6d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e54b5000b1535233d1c8ff02c91b924791dd54e22a8245f8c41760897f9b2ca +oid sha256:07195bc65eb79902ce38ae4fe914dc9d60e856106bb71beaa43785273d1ab972 size 629426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 083efc957f..a3f0c22d52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bd1e7c3d08811c3cc7c3e1575223efdae55f3d163f237c92c08fc4f0830bed4 +oid sha256:294b4b24e42c3a1c34aba8c9e18ff0957cd3bc821ab20427519c13ce9a35dd8a size 526361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8c439bdc6e..3c950700b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c60b9a7fc8a2efabfc05d0017bcbe4b2cc63d72f4369d959428ef9eee23cc388 +oid sha256:1812dad30f5b40440d977c3528fdaf6d9d5a65b0e3f469d3550a81642fd9b55a size 595039 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d77477877..6af369e9af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc07690b70ee40c40ff0693a9b9a3cba7e763cf38f75ddf456c065702e686f73 +oid sha256:102c642b73760c0118f586716afc4a431b3c6591da144b3c6c42688300948902 size 496215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c909ae2ead..1521ef6cd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:429cc678c3164c611731618352de198b99659280f94ca2bdf6e060cba56cbd53 +oid sha256:55c3450097542ac9320b4e28d5ce69c91cdfe96d906cb19f737c4218f625834d size 688904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3f6c5d224d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e1255ddce06aebae213d92ebd980b1454bcf9806a8a03ff3e5a531e1e10adbf +size 710486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..222ed3d428 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d84a75aec314cb9dcf738f8ac0cc9862782905dcb0109c6329c4ad81d4c843ae +size 607125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index c928483e47..910e5d003f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:458ac82e0f9782992e52852c92d2bc8862f46fb8278ab229e13166c4448c9f37 +oid sha256:9e4c41b53720b70550f5fe852f1cc60f5e37012b15f1f31e719d27f177f1a475 size 684808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..41b98ba5d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c349b3ecdf85c159f08f6dac5f00d1d82dd0d83e6e68e880f2c0379e73067d +size 704072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..339c0d69b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f04dbc0271bf099adf010f441b33ef6f89b0f4f7d0cdd6e5b3103bbe38362f1e +size 623156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 779be94b91..ede7baaa8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3975c4a88a4862075d0e7cdad0c73faf5f35c79eb1cc7980ad2903e05038242c +oid sha256:6b02ae961c09194d7dbd594c7b84ae5bf505545100922c806dd93619c4e4d2ba size 754450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8a2127e2b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd236b467ce3b979842b817fd4336ec45bda78392479d4926066e91aedbb7ed9 +size 776132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4e7996b38e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a55050809a099e776bc5188ca2ae8f0f97eb47575154da93de09350df022b6 +size 677852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 354d2b06f4..1af7b35ec7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6df9164f76869ad9b1d4d3c7ca2e564bd069cbe182e92f6193f6e2f66793cd6 +oid sha256:0d0be720ee02388b94e13e471af46cfee334393867bab314ba25065b924b133f size 830912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 18649e2d5d..1ae73b317c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0c70130feefbe7149cd40dbd7d2ce00ca40eb024704ae2dd32844df01dfb578 +oid sha256:2979c6666a940efb3a09825308ff221d66d128d1d6d8521379f12fe6c296bf11 size 796552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fa3cda8b5f..776be75906 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2930fad8d86a6a1c598bd64cd735d6cbb5aedecbb20673bf4d2838482334fb45 +oid sha256:3b0666d49a68f8e33f0bba39a7d94ba797e235f6a0a02c210f5a28620459a07f size 762720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4dabc6efa3..00d230fbd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11e25971577ef4d1ee66fbd35fd16ff9acc8a50946df08965cb57d94669897e7 +oid sha256:14011d052ae63c8064136931c28873882fd5a314c6ae4b446ac903f44ca5ea27 size 836374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b0d792bd7c..b6ea091661 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f2b7fe061bb07d83d38f9f8b349026b9a1e365732ae864f66c212caeb2e785e +oid sha256:58f550c9b3ec3d86e1ed9a496fe9b886e61c5c085f4422f3973b774cf9770d75 size 726154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 4fc25f97cc..af6893a4a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1358baab12cd3f14ec2606356137e561b1a39274f690320a52e46be095f68af +oid sha256:da8bd957c7bc4fab3a7b8f0a3c5798c1cb703698c18c76fc028bdd79ec0eb979 size 922608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 892860c7bb..65d7ca6f28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e9cf07bf1e2abbc1a6af087ebe6c938d28adee11df1f3795ca87d3c346b35fe +oid sha256:a90f8ba5df8a5461d7d1d798ef1fa8f225c86ec949363c16e858ae951fcc67aa size 710574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2adbfd5b9d..9e32c83bb1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccb5add0b9a32ab66cd57af93de567de939bb743917b52542de3eebf70b36de9 +oid sha256:a231210add4f3d51b069d8e3b783ca3c7ab3f1de3ea3d8176ae112413100993c size 768884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..aa59ef5e66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36b4aceedb9261bc0fd1088416536b0cffeb94892c57ec1138bf504abac0c2a4 +size 816506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7b4508b252 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:729d092b76ad3c5452c5899d9cc6ada904bc7a4d9f6cdac0039bc4558c8f41c1 +size 719952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3ba3579c1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:351d60624dd7f8c41d61d7fe69dcac29e5d5c94dcbe7a7f987260e49b40b453f +size 663452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..65b75eddc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:401ca6aacfa2c9a2592764b58a0031e38130e17162505b77b05d0bf66b982bf0 +size 628670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..201d0d79ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce4ed366dc4af90f9c550956e5c77564dd5261401869e161ea10061fe51352e6 +size 645934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4ec5e2aade --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4efe673fa5982c3026b8991adfa9014a9a4b8af101362cc11189a691cac65a8 +size 616775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dcecf5cdee..ae27f62e0c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33261ce171dd2a1d0a174c7cae94f5f0280285ca3626e2ec6fc8b5bb3c8619a2 +oid sha256:336ba319b72eece7bfe1ebc09de0586e54dbac0982910cec40b68510cc473ff0 size 796598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..df8f48dd75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59db77c013478ab8319988561d179b66414044bd632b6722d5f3e2e3bb0bedab +size 819562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1a71cdc108 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac354e305a4609fcb3232ccc7679274d5476fa6c126be678a4b81f87913c1e52 +size 733222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dd9355c170 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec6b8644e90631acdd2ad66801c231bd8bbc88634b7fc12b79893b9db1eed013 +size 664172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c616a2793c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a8dda6282050ef85dc51f57d7474c4030acad8afb8ee2f01a62724d118d49f5 +size 565595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d7fc30dec8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1788a6f87d88d9af2f289901bf224a8b12451a346e223c4777bb3c99cea4829 +size 627862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cddee083c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d447e72393534ada7fea1cf3b0ce1cefa90319f91cb9b1fff0df80d7df08e826 +size 533429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c7c0984222 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18a26b9121d9d09150a10f9024c8645794e057c2fed20aaaa9ec169a737fcea +size 689732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c6fc3b8427 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:540a7a144f9a1085b0f75e76a49daf2ec67829096f14d2a252102d362e2154cf +size 653322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c88d4c0d7c..b612bcefeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65e04c00c1fd02acd5fde442a6ee2485c668a6ffdc7eacdc2408e768ddc2310e +oid sha256:b7182f1184d9ce3d0d820f37d284b9103fb0c7e3cb78ef56bf4ab927de970992 size 859556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ca443aad38 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9668f0d476aa4a31b49499f98eb84d16fad43d3497e3aecfd1e4b69e2db66923 +size 672164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7a4073061a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb11313d94364e2c15c2eaa71d7a286449348a67fa9d1933fa820b189b19c4fc +size 642216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3d8ee571a4..afec15547e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af339536a04b93d32bc005249f65dca4d827691ac749b81abd915136d3ad1132 +oid sha256:360bae716502b9b6b0b0e8b3b51bd340fae5e54b485d9ab8b5e2c7aa9ab1d171 size 870436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index cba428b9d6..61ab681801 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cafa7381794f5dabe32b3b4440c3df89a233916ee70546aafa7cb8a5c2dd74f8 +oid sha256:4edd82c83863a1f1d94a3bec322885d06ecf8c73e0676c5ba66c124d914aaf51 size 810804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 477cb911ad..47b0226a48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34107176d6aa0174330e4ecae488af9abda55751210e8805ad6f4b0ede7bb557 +oid sha256:36788425128cad0c7df1c311078e0d62460a0ce92d1b86c14ceb89ff2e4b9802 size 864772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index b5cd59e4ea..ebbb65e549 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4d98d58fe26df508395dec4559b273b4c94abb4d634822c52f1d561dd607102 +oid sha256:112496e6ae5be1208ff78555fb3f053d155a070f4ea22a9f3da6acbb2034092b size 957272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 222cdcbd4a..18cb8089ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71aca0ab442334eac9b2c739af8bdb3a5194046afb75b34b0f36c0d5d61671b8 +oid sha256:a4dfdb00ab145d7b4f0296a23e067c487fcb4c8189603a55ee068ee360af5c8d size 744646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2c75592aa3..b8e009d0e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2619a74eb1527585464b3e01f3d96231abc1cc7d5a7c0629b79c160b04aec1eb +oid sha256:9dd03dbb40d1c72ba7476d24f396c99ffcc0ef89c18a5a188b20717b893b2c71 size 809270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 594b3806fc..0b2c5fe24e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d0579cc3e61bcc1cc9385958a0d0ed970b8eb889f4adb6dada67ac526184c74 +oid sha256:faef3949a97ebd6fa1f342255b8620ec336e19ecacfe6fbd727fdf7a75687a2b size 701912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..904f64ec2d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f60d7508d1b10f19d17ffffa8f8b27078462b7b0d40781a057290ab24c62912 +size 890242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..396ccb55f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a9c71663b0a382af504aaff676c1c80afb25053f927b2870a0c6bee7f067b13 +size 787276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..78174ce71c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b192308bc3d7358cdcd3b35dfec5f5fa6b745db0ea68752703b0d2ccc7cabbeb +size 686998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e57e3752b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc46fb1a0a8001feedea383acd572b22b24604943c059bddd230aab9498458c2 +size 585561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f774eaf11 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:913a63af6fc1d0134fc9781efba404e7bae6257d46b15afa3110ec7f1e1ca7d4 +size 650934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea1d575bc0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bbacc97142ab5b10aadcdd80b67c726cf3635a3c6cd59d4caa9808e68e369f2 +size 553393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6c3eca8c6b..359093c278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0838727867ffcb2eebe1be87584e32331187e26ecd2e757e5efd6bdfa3c4117 +oid sha256:ebe9edd911fcaf278f56b2cf5b28cf4764751e68dca66dd971fbdbdd9aa7a5c5 size 679776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6f03848b6d..b1e690b104 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d0c14176164f69bd34180b9db363a862be2d2106b780985bfa64b3aa60efd02 +oid sha256:1c84c80ff29d478d28a97537444eb12daded6a1da0deff1ee39e5d5dd071130c size 716602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index bac68792e7..e65d4ce110 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:567c110a94aabe1e8a78d8dd2bf60c980aed8f637c6bf9239bb7d0c4d98d88aa +oid sha256:5876acc6ca6cd7bdcc6496b1a4e6756d2da2ece7d9bd997094340deb3a9c8656 size 681254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4566684302..2edb9a2350 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0edfb6527a73b87937b2851904adcba2c990f0dbc9deeeeb7c3d844694629bbc +oid sha256:034b63e916b36db9c877d455d6b15c31f3a7c768c82ddf3b75836ba430818949 size 714774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6610582890..1954200183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2253677d5acf3844a0e3975d989d66f2a2498b249a8ad734588b5d6cb7113480 +oid sha256:8a1812496b30866ba4b5db366426a580530f268c93033a496755d51be0ae76b6 size 748528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d04df42dfa..065b466aa9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c47e5c37b3779ad1a457981997b743892b481572b636de4085b7dddd969c6e1 +oid sha256:f19632c85885f22b6337306236982aea04da3e1c6e467a37c8ac9c510a87e0cd size 784764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c889324bfb..6087f87d80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e46e2f482e6ecf4f02e61af5834297ef44de04be6c8e96c051ab69a544036e30 +oid sha256:ce085204716bd9ed56b1d705dafb1ca77cd527c70c68dfe69674246dad1c0a99 size 769120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 3e330b676d..d46d49a712 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e254de1cd436e0da65b1ecaceaaf805b2c92307af871817a1e3af535c7c7dfd1 +oid sha256:34b5e26ec3b1c36f0b99c01f0b4809cd95393a0030173a28cd63ea8df1ba4df6 size 744120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 929048f51d..13c9348b76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b9abe940f53578539bcc2d33ee914ec6440b5f2057e6d5307901ece3e097850 +oid sha256:fe329d6f59cdd94ae36adb143541d65c4502bbc2d3147f6da225558b96fa8bbd size 678110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 759c7ae512..7761bced1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10c5305fd82b6f181af2f3f2afbc4106d803b6b64152785b03062360abefdeab +oid sha256:5acd4d96ef0ae6e163dc88b44f7f31f24982ce899e910298bff213dc907c8307 size 752948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0f0c1090bc..fc7a8346e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d322b5aab83d4c33207665f934c23795bae390ffe3dbbf74f737c8bb738d7c40 +oid sha256:6e5ad75c2d6322510ecd0bb327eaa41a3d887e763df94c116f3d7895124f1f18 size 669724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 7cf2177982..f7b36acb8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57a141ac228e85db05e1104ccce5974d95dc04ec5c9f9f30d2b7a85be7d3a4d1 +oid sha256:5e7f06bff3576051da5374596b8f326aef0195f39a2a4ab314df18c299aeb12e size 582739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c4190c9b88..57947f5211 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea0e5627f3a92ccefc4255450210e7a263a449ee647489661a276af1b923539c +oid sha256:bd3bda6347e9cd69505c0bf60225d2a61e419797ce42c810b810f83f02593bd1 size 633560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cabeeeb15c..d0622f4641 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c4412a61d4111df9c7d0738d33f487581c610d67cd0b09eefab60e53ad7c9cd +oid sha256:f517088379540b0de25692421d33259e310fd24d8ac0f53727107ba37f9c7ea8 size 799830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fd16edabef..ed0a95b899 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30d8f25ec8324bc4cc26215d7b6d49fecf309c724314003c44fee5db03e41614 +oid sha256:a85548c4023447931db1b7e305ea44f8174de8dfc51f57876a5c0bd6ea74a81e size 722956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 823961a9c7..dd1470709e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4ae1a6fc2e7b1b563d26ce653c2301505beedbded5a08ec8e95e960ee61613d +oid sha256:8320f5700c857c6ec8db7ecca42bee4c61a0dcfbba1166294a41c91524a0bbfd size 636270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4765915f1d..0f0be4c7da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d881426190068cc59d6570f04c4ba663ba49eac1f13caec37c6f9c74d4ba42c +oid sha256:a975609012c95bd5e36801ca76bc62bc2fd4ccd42c705406ff37fbaa12b6f684 size 764222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8972e7c3d9..9455f1d278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5892ba71e4347c23fc97dcf875feb6b0ef1eac1f9e218390c66335684f55c90e +oid sha256:0ece9b1d6eb03d77c56c5efc47891f92d13b3a477da15f529fab90a0964e0ab2 size 675070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c4628d694..34fe70aa90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abfb5029a8c16e08ccd274503d65e95e8bffb7f1a4bac278614c8f022f36582f +oid sha256:66fa78e764e664a336387a5de2e6b06d2586c57ce99f7331973dc98d3e522ce8 size 569913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d80ee2924a..8332d2b2ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f9213397e0d2507341a59de0bc30b3466ef811116f52585327e0d60b7ae2698 +oid sha256:ce3c5e8477ec7c5fcac0b04870a9626d88b9564d594a30b8e2b34af5b85f50e8 size 548205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 78f192af29..03c9fee981 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5189856ee0beedbe28cea35d0f9665b0e3efec4b4547da7dd3c2357fc9663437 +oid sha256:8170352f6545c416f50703be6b7cc48ad486123a1d926b438c5a73896a61be33 size 560781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 237c8ab91b..72d68272a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4ebbc8efeb31a727cbd9aa9104349e7913bd24eaaf6231a52e1fbef7f7842cf +oid sha256:857611452d5b9bea5632871586b76448657d3104766331a8fed9414b3907604d size 540355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 74e5bd9162..655da91a55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd4fb459c0357bd09662ce57ea5207af63f327619ddca43ba4d11f29b1753976 +oid sha256:2a36e338962a6acae3f5f8ad45c2f06c672258a71607ce5a2fd4016fbe94ad73 size 760732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1f34376eba..434a7c9cdd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec7ed5c87fc62a9334eaec999a0fe9a4a379d16c570fe2085914e5d4a634ce0f +oid sha256:d0432c5f72edb982ac69264b44d7aa713d452968544ff341f395942eefb031be size 797114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 59ee9e0bb8..96d838c403 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b8714efa102411f846895b9ff7ea09501714b349cd6d2a8b236dd08dd98b2cd +oid sha256:02fa69d04b95b825957b85392c77368b6df94ce45e19bf74ee821ce3aa275b08 size 587901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index bfcb174f51..d6696453be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f78aabf47f4b9872969246e297158f31c413ee754c911db9d00f70f08af1c49 +oid sha256:0cb98501cd2d2d6db8b5f075b7bbdde1337d4852482afba188c8447b46ba9169 size 487301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dde51bb1de..644e35aa2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e6a58117232af10af86a078e3f88be28f11a08886c1af744b416edcb0d5b883 +oid sha256:7acf6b060a072bd4383b31be79c63032801bdab369a55418042dc61d9513f5c7 size 560963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1395413dd0..c4454318bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c178b7cb5a3d259d37092cdd6fb6ffc57e23bcbeda0e7c5269db42afafa8fe2b +oid sha256:5aea462214367de6d1cda5dd851f2941dc6e80da608b22836213254dd9962d29 size 463571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 10dd9bba76..ee77b2d7ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70565f18a40f602a30d3b4a3c4dd238858ed2e7e7f50c62fdc057f1658bd626c +oid sha256:8daebf52e2b3518b976e7ede38dd2b3e1279e0775bc91e020f6fec7dfe92502f size 595403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 291242b817..9803d90f6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0158ed9c0be6aebf4e4590b3c8885842722c0e5df47e0328f529a5f1e4a194a3 +oid sha256:14b15e3b568ee4638ec40a65b902fc001814c254004e85c0db30b0c318decc23 size 573745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b2686299d6..575ff6d403 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6ce84dcad28dc36246cd44e49bba89ffa9b80d68ecea7d5f1b5684e577a9a99 +oid sha256:61aa835051da2dd51eefcd70aed565de16843312a8e7d249382b1ac7ff96bf36 size 586271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6e52857d4f..f2857101da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:462b93507a9cf10a3fc47358673f0b14668e13846c80eafad0f1cb34b69d9855 +oid sha256:a9e9b3d4e08f60b58a84386c5c8d27dfe2c281717c45b6e2b3d81eccbcdcc7af size 565007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 227a807709..c7c8441758 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ede497b1837d5a67e0e3fc5c131ed8ade8934d7dff2649203bf42956b5e9d40c +oid sha256:98a24476a66b9f7366a947de91346b7f0aea8346f0c2a65ee2326416f38364bb size 840340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 23febd6266..0bc2f83b8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f57a68ae7fadac82796eabddc86f7f74851883334c848bb46b086ac67048b93a +oid sha256:e73e5e090fd44e70f721837cdbcbba158c93e199574aca3bd544c2e29b6bbd98 size 792894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 450ab0d210..bc7ffb9332 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0f64c03660f6a28b0c5fe5d2a47cc0e30dd8c9d632fbc686d52ea73489ddafa +oid sha256:3e774b19cf755e6f6c5847bd744cc8b09f511a9348c10d25b599dd3fb355cf5e size 706656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 5f2d6686fa..7dbb941134 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:677b151ffec1023b20aa5f148039ac97b2543dc00cc3435619ac9269ff7e9503 +oid sha256:7c42ba5f7471e511914a0f7069323635f07c659f08370c79833d91e945bd1d15 size 783764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 453750743d..321dfb5843 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72f5b70c0a94653edd25d6ba8fb1cc9b621058870df0f8a211e45755e62826bf +oid sha256:d4a0155bf691ae02efd022ba2dc2249bbd9cfd4fe18630bf99ec467e77d7dfcd size 708088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 45fc8ce73a..fcd2bb1d8a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4675cbed298ffcd920f41b112eb36babcb45f72fefd28a60822e9958ab10bbea +oid sha256:d7e8171b069dcca21f0ebc77f0255242561f4fd43f0cfbc480307b07b28ea517 size 608771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e5d021a7d4..30b62d6115 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea33a65bbf85a513898eec2f86f33ea11e3faf22e0dadbc78fa6fa12e25316ca +oid sha256:137e5fd456cd57802fbe36171f942096e2c012fb239bba3e47c3d4b7dc8593dc size 667780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7b452b18fc..8355966b45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82c752dee11315a706ede1ce46e52805e2f4610f26ca6516055184b3feb25f1a +oid sha256:db5e3bbc150c3970edf58defbfc372cce556109d7898aa8fd91ada43b671248e size 871246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b56707505d..619aeb52ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5d04119006043ad825157dab3ba3a5f2376e58694e796bd3222619642c1c465 +oid sha256:4fc15b3366cee0724741adcbaf392e15eea7865d47b3ba6fced2ec8c0e89c3ba size 611911 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec3e259107..c809d68df8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f77df0c2f2e59fd250707989f5d8f8bdc423f42e9d674697bf5eaf48fc4044b1 +oid sha256:135a4ad86a938d57393d7f15cf22d390dd8f335c8874c9af7520b7bfd51e3259 size 510227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8238121529..3bce4ef2e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db683617525cff322ebe5b9e0618cf02bc4ab1bb4c6dc07569466966ae809967 +oid sha256:648feec11fdf8cd832ccd47bb79a3dba983567980a658e7f6ad0aa54a600dbeb size 584529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index a63d9665ae..9ead3367e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91dc1b57c9b6638ce004e0b6de2991994c472f0784c2016adc39f2b477e77606 +oid sha256:c95477a6b48973ba7a9d9b82a82add7fa24e2077b1a598cf2cc17a837c060f40 size 485705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5e23316698..ff3055ae10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4908bc69143f2d0bea9a14e7eb40b9aa1678158253c77566bf306df4bb3f837 +oid sha256:5edc98a63983a350fc8854d25497b1d1249c41c609a37733444a5fe804a63a61 size 761684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index eefccfe66b..b3bb43dc19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2755ba32b94e7114e45a46d8ff7f42f7313705c9d3a952cad49dde134b62ee0c +oid sha256:76a21f3514ab6245b453c28be239a1272880211b828676083ae97978002f119e size 669720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..236c955ca9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:554e885ab62a9e9304bb30e4a54b43a8f32520b478fef8c78e6a238ba096ba25 +size 789780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c2ffe9a8d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5899c9fd5850bb0c1beb13d6ed03eb07d70167a316f91119bb1814628e979d6b +size 697962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9f38884ad8..4feb94740c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2121df04b5e93ab739381820ebd3063b30782d0406aa3f61318f981af3de685 +oid sha256:85ffba6fcc0f699c4da6d9c5161c9911e6de7c650cdb92abf8089f91deaa117f size 768786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6aa0c8d1ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b64603d79ea4270007075d5a5527d158051349427cf57456594b9252bb6f1ea +size 787854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8f513601d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e5bc9dd0dfd52b8798f1d86b9354d6ea875b77563658e625395bb00f3158a1 +size 702746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 74fe930029..cde70d3ca6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb637b4d63b78419d793449425eb8b890b7a1eaef096f644494114d0c4d24533 +oid sha256:f84dd4bffca8bfd692c66ea06c7ece9f0c9cc009f4fc5ca460f1a5da20cdd090 size 866668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8269434f0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97a188c46ffba0f035bf3ed73f256c0920f442f62832063d243dfbddbf09f532 +size 895354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..39b78d88b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf62b874f95717a59af6c5523a2bc8fc81b8588e3898f5835eaaaac8c6cbe5d8 +size 802550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 43dac85551..1a8aa43b19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a2cc16678b2a8d3cd4ea36ce3d5dce61b3c528f20f064a6586ba93fb4be78f4 +oid sha256:66fc3bd42185e2f73ab36819bcd1192aaf23d3fe081c8656edf9d923834f31be size 865532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..51909499de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfbf6f159fe18f2c97608217c831363efb2de27f0f89b7cbfe471e7bb357bf77 +size 894662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..70c48f66a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be297a6b5fb6b93b460fdf095c79fddb365ede326a72fb1fbff0285b26ed3db +size 821936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 519fc8f35b..bd190a6f11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55db75da5753e4eb9eb22d80ac24f3456e0fb5e6bbf2ee0274fb8f0fec481dd6 +oid sha256:5eafccf69d6d74ea9c7ed18f003375e26e9400944ead8354796d2ef44d466a70 size 762478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5308e9a9ca..20ac3b7b39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8059edae23a86aa137f40fca2b71bc51fff17b5561359252e6352459ff8c369d +oid sha256:7112ce4228bd30aa13ed2efe94c1881a4080cb47a0b3d1d14e012b0488470170 size 670512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..96fdf2d917 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea2605e066b75129388f9529978c468e970c24d45e2a4b6c5b44477b8bc9b9d +size 789636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b56e64e69a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbd6aa4cd68cda22caffeff3f3eacd4fd8d9c880254c4d148f5a65429d3ac0a +size 699544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 2615613abc..c3ec2bd69f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d23a79375755e4566782195192a2f85eba5d0f0aa33b3b24d2484bb4cdd2989 +oid sha256:2ea196abac32f1770706a33c568f3e0877f3d1fdd862a36853e2e79136091a9f size 769580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dcec6fd667 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2927270a808367b84409a377a51425d53ef45b7803011eae711570dc8d70708f +size 788646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..48f17fbfcb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bde93be61c8b701a8b76c9c0f7e89db9a39efa5f271010c0eaaaeb246f47631a +size 703538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2d52a4e037 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3d15e89bc1431b876c3b6c44d2eb72a89c3feb87336d5d3b19446df67eef598 +size 734924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f69077c25a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3d6c34402dd81aa1cd761b10654782eb4a041d65afc49be9497485b00bd54c +size 708676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e032d805d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe1a0617ea02ca758f1cc52e5acc7119bcd3e47c085cf480668612f779dd57ea +size 695258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f8ad8b3365 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fd7773db121a62a6ae19aab870212b174f4332b2664abd882940ab875b411b +size 670194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9852d9bcef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ee0a71a829476767cd64021c9c4f67f60d539ad186e13f36a352e6b1326d41 +size 759548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a3242a520 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bfc934f8ef3dd667d8179332bdb4d5e2586ca186ffd53f557c51f76389ea771 +size 721646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bcab5f9f75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4753cbe5f6f0de314bee645d38e8f8edce4d939827f725c68b8087946643a60 +size 695448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..210937be61 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:035bae20161f292af84486bdae1b51fed28ab60b046bd650f030f8b29f0063d7 +size 688000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cb21667402 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f3b34c2a818d311118c0160b682c97fdbf4afe50f78facf9245bb07e4afa0f +size 663626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b0cd7e079d..31a41bbbb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eff2a6b6d53485d42d8e6d2c3333d66acb811a1b11f486458fa221775472431 +oid sha256:30946125c70e83f71b244f00f8ac48c0a097cd561cfda1c24b046b0c1694536d size 953772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index e24290d0b8..a633512d0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74323b7d9ef5473cab34239ccec2fbc05bbf98c3bb0c8106443900142baf1c6f +oid sha256:308b0acde95a263337b5839a155a9787aac5ef285a806866e29fcb83e3f72c84 size 894322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 32dd688add..73660cf371 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f4bfd222cd7446e484183c1a5f2da08ab79a358fc2f1790f8d51f09b34b6ac2 +oid sha256:3160d995640b4e602cc840b82488b205757b8c63a7d05f866f01d414dccf372d size 768228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 68ed16b73b..4b5cd90175 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6301c7860e64356ba2f51a0cceaac0f87be1e072398740fefef62d070d296236 +oid sha256:c765f7645c3378a9462f28aac2d7f68d3acc414eb0cf8ed66650c74c306d74d9 size 915388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c951073887..40d541e009 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86f0fbeb6d4672652b90fcec2f09a7956e496eafcb60ee27c17fa1f1f102100a +oid sha256:84ea959c1037a0672177703f3ebe37c8affe35b0b6ced86b20d442a6dbb571ab size 825742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index f97c25938c..eb1b973ae1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c69d199fe95ed86051644d3b7be43c50ec3d4de3301e22c807136c01148dceaa +oid sha256:0af654bcbe175cfc5b60b3765d7cddce0bd6ddc118748439c1c98c49803c4ef9 size 859442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 9b3b9aa01a..f611155676 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffd78121eb866c74aade3feaa0999d27bb9d9afa48444641d40a0f6780c7cc5c +oid sha256:d5b4a1a0f4f4e0327d46a78086a18e7a53f97542ab0da0c979b6acd3c4d5855c size 765996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..02be524ad4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:970461770a3237f928ede202e4e8d371d15a5e07f8645e543c4b483ae59524a4 +size 730908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8e149ce4fe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a09846ee076872c3904c6bbe346642a38349722d0c3d4937e2c1b22e25a1d988 +size 638696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..67628efb5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16f3fbee451a5000df043a9a5ade0a9047a22b5dbb41b9d7263e3e72c8d55586 +size 704216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d26306008d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15fd1e9a53d59d4e75c9741533a12b986f57bd55fc6946f6605b63dacafa1b9d +size 612941 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..de14bf7588 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba58ef4963a26baa4e023ef9a7ca9840c46b39ba3f7a6aaf122b8f92720118e3 +size 794054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d541250efc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1b10790c025efc61efb597b58756319afefbe76fa393d424bbfaa2a6371ddc +size 723944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..35ce620c04 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1895c55279b6f51ed2bbe4f00c5628123b2af98ac8690fbb953b48c66e39bf23 +size 700614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dd0fd66062 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb62406ac11da4d09411f4d3e3a720b3067deb5917d3c6ab1d9e230a649ec0fb +size 610425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eb04bd78c4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ce963284bb696e2f9f01e6e424403db1dbc3ad22c93efcd56106f27991d6b3 +size 677130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ff732ab66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e49981dcb72a258436517fa9e6941bb525800e2b2e46689d173c63876bf3ac16 +size 587285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 57671f4c66..3111ffdbc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13d58dee76e66cd7b4bd4d7f17de624e7da3a15a73627bf0ae719a7d1e437549 +oid sha256:bc0418c01771d7a439d71e774b5412fdbeddfcf75c55d2fbab70cdfc4ab24d76 size 895706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f5839bda4e..395780cded 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d1bb2a01a063d52fe564ac6c25ced80068789f0a93737d0eb3289c0ffa8d66f +oid sha256:a1ee4d639f32a6546ae52f6d2bac85121d821a296767b0b2acab2d7e18dc76ab size 865906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d9e5a6d923 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8e1ba70fe70815411791831a622c2f4950e12edfb4d7b8d3299a5bd210e528a +size 772416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8074a7d414 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e494625b1345a28d7902a37b2cbe4b6f6d7d615abce2df8ca3bdfd3eb35acab +size 742666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cdce130cf2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a6120a187a7843f26b6dd1c8479f6d36152bde7b7149973fd924ccaf4911528 +size 736846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8d8dfc32ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccf20094f73104121345db7110dd56b1b383f9f96d7d4f4d4e6e73ef92a031e0 +size 709710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..14e6585792 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ed8329cd74d6b6a8b08408419240c990ca73433483a39095e8e47df64fc6f52 +size 871494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b27b3dcde0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdb23e5efc427e786877014febed01afc49bb4e2c689a3271f650fc8994d7df2 +size 839912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ff88c66ccc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38f51327a6ed1a7e2558cbf6fdd0dcbc05f7753dda8df0d0d752300eb9ac551f +size 754946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a311d9e91 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f83f86f12aa0fa780f42805d115f877d005279e16833b418010f96330892f489 +size 725986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cfc3171ee2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d2e03e0d3e577bd23795d5cd3809d4bc7d5b78b977af9778e4f182e2e5fa975 +size 725000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fae46696ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b33c148db7f8d4a72567b249735be37e9e1e6deb21154c259163ffcb1efa4d67 +size 698654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7af137cf38..9d3dcf8992 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ef8c145ca3f5c89ccb47bc6bce909be832b7cb2315b6b3fa1948137831ade1e +oid sha256:35047e33b638cb6a5981a535fedb8b195542bf9f37f055420b525f6bbcda0510 size 1034332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 6b43d86299..320ea33224 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d60e288b215cca3c408e40e161642eb6e09fd7472347513e690adba8ac90d8b6 +oid sha256:a56f45769bef4c8ef1cbf44a5941d7aa52dc3692a106726dff5de82d724825d0 size 951550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 982faf3393..11d67e8df0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eba24a064f91004aaa6d1e7213b3ed810a4ba1768ff0d992da036c343bfae2c4 +oid sha256:5045d8929a26d4d9116445f0a589702591e108a5889196b08844e2f65582aa8d size 901388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 8794eabaf3..2ea97b8e75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc937df68bd39be3e3832de40a93e20b775383f0e950bfa31d938b22712c0bca +oid sha256:e61a7f0768c64af541aae65d4141edae26d80cf032b0a6c362e72d8e347fe5b8 size 845830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3576a37cbc..51e02f2ca0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8666ee0e3ad288b630719dd5a3259a56afeaf5b8a911f8d1f9eed9ecd09a146 +oid sha256:f68587d86a19f3df5893c40991fc65860fc81f7ef2aff95aac85b200a1c07eab size 988550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 7cea7ac45c..e5f1665faa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa5959f3812c7b70139f077d1bfbdb380b984c0dceca61c23014bbff9cbd1452 +oid sha256:14aad8e36cc5d52b78ba6309574412790e9af2f34549eea162523448ff2b9eec size 909368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cc1c229119 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d41f4a773b1669f477c375be7fee019f08c9a005cfa4e84ce6b2f43b88b3db6 +size 765540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e3cd1f3a5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:252338b6a2b3cecb4730430228b7a201f1d3628dfdac5947ab71ec410e35f012 +size 672982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9b0cd0664d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670c0a1fa742de67bd245fa72d2e4a5e637335478fcfea724702f5274d1b1fc2 +size 736086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fc3dfb7b4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b719f2a324841d34f37a071c39f02aec1f3cc1edfca5d9108ea6ca9657da55 +size 644218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cdd9b1c63b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43e677e5ac4bb50a724672d9955b4213dcce7fa26cbe1bc7d60c950eb81be568 +size 925634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5d06a01cb7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7721a51c6a4c8e2474610c11f72628829537597255ec5fba32766544adf37a0a +size 878414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..29de47ee40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b28832b40ce0950a33f3dbdf6e990f3de00118988671fcfa1db6559b736f07a +size 833866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e8dd997908 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50f5a75a5e5686cb2b054927d7b3e046d5b9195a9d5633d07feabe1f61d537b7 +size 804406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2c5220b056 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51eb2dc668adf798cf72218d0bf08fff4040b03bc65b9ff9ecf22d7bfcb46595 +size 731744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5ea4e51c13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348db9e6a0b9825beb73e122ddae346103f89d0d923df651d6f6ec348f6a7568 +size 641210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..05a2beeba8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb563249002cf1bec280d67769c6f742dcd23939ea85b2b8819148cfe220bee4 +size 704708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d7767a1102 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91d785ca42abe72c1bc0ac326c1c66e716a5097b3786b1d05e0114208f70cd8b +size 614271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 77a0d86072..7236279a43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22d352da23d920841daac461fc44bd803404631a6dde1661926e509b8cbe6ed5 +oid sha256:a76c69a9bf2f2a9a04cc938339ea993be645129893c54d33d99d56f45ebc2d1b size 969212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 6e6c9ec64c..6bbb202f8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:091a8a5f4689bf2f25e80fcff489a6a456d9f4db55ac6f09ceb310928c434bc3 +oid sha256:c3d0788074c4e10e9d846b0d68b71b8597fd179c088c98019408709223e911e1 size 1142578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 935d19350b..f2f560b6b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb796e59ed8149eda5442464a83e45d51c1a4489217924b7dc4e347ffadba79a +oid sha256:1702d0041fea82ae53b37b53dcdea16ca1ccd952d9851413887dfd782efef408 size 937834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..acab765524 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f47167977341b3f472cc0cbacb2234eab2f04702dfc9a2f674806e3610d2d517 +size 848418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..52ec22c07f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d73fbc1d63cc0c8537e4c64f563fa0c16210bf2c319edc4b74eac2fea23c07e4 +size 807716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b3e029966e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20be028bfbefb5685f7de91d6b60e3a284e35f30d4cb4ec66a7bb1b97a04653 +size 804262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8398e7e321 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44461bbd8179119a915e98e650822658a271808a5d2ea68f11bc4f545cc7983f +size 765386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef2329604b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8bbefb0523a275413f521d74d0107039b3110a422563a0f3c39178ddf4c39e9 +size 983154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f27adbd093 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:672f99c6b886b3b23c2989ea45e921d8d43c4410fd4c5dc6aef2b31fbb570725 +size 822758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..14498bf5be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7c0dbd1802089d795235cec572c0f762e576810d2159307a843ac110363959a +size 782056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9de359c21a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5535261bc7e4297aad170d57d11bf7b4d404d01af8a1b0fb0401d1a7058fd8e2 +size 788026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..685ed1fd2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad00997d34081674001e4a42746e3bb8f0ff376a1f3437bfbd071eaa5170177b +size 749938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fbf005e23b..eef2aa8c4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22477d51f4a39ff8f0b5b34d12512cfac5bef3719fb6a70c07d5050bd27bd690 +oid sha256:4989aa413e25e32657ee1e68d258576c250896fc92667d8099126b647a0698bf size 1162972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 71fa323f5e..25e3c61bde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcb3a6fb28b86c141802f28578d66c32792b272d5b194a2725c71759eb2d4dad +oid sha256:f782d3f120541463a6c3a7babca55f314d18bb2b88c2c01e67c0171f22dba7b6 size 1054042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index cdfc69f2ca..0fe7a61b49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb10d3eacb83caee69277f6d5503e09d136dffed8a62d596ec579515cb70416c +oid sha256:d2fb994f0cf6f62bafc62008f45c8407c262899861fa17411b07c23554845755 size 950838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4d8d13ef5c..79e854d3c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a21967b65ac341a22c127a7992115f9631a71d25b79f571fc3cc626a032d6bea +oid sha256:412034acf41793b8cea233726776526ea5f5799a86fd7e6f0e8d4677c0cc7fc2 size 1106088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 5090eaca57..8825f20519 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee1b70104eae0421f1efa95d17a3381aeebbee663e07f5cf1a4a39fd1f6afec0 +oid sha256:d20708098892872b975272d4eb89d965ff975dd7c1d0a84eb77785b167b6fe01 size 1001846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f4ed50fb03 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b95df5bfd19f69ab52f9454ccb63563362f017262784abb19a48533013db1f6 +size 835768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d0b45c282a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c32aca986e2d450ed1fb69ab52b5983c01417519bc5aaf974a94bff1a0a99ece +size 741286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..75ef73a3c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d4bf31ed74c5fe556618bb5c25ba9cbd4d909a54484dfe0cf8353449ad894d0 +size 797236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..682b9f2672 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfc674b17a0bc51b258829030ee0ee6bf974e672e2cb481d1cebfb10feb15b11 +size 701028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..34dfabb0f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36e877cb748c0a6c42ee25786fa634abe296c5765b764639ec24ce42e6c81eb3 +size 982386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..12671efc45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0fdc51ed734f3c0fb4b443f73610ebdf639dc2bc390bf370a4828ad91770a16 +size 947598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c7cbf4c2b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebc501cc21f9839866b4c0451aabd20b81e7e9e5fd773c49e53d71d94aa16c82 +size 790576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f618208754 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:625ab87dd7b943e688829cf263e78ad3bc4e49b4ecb443228e1e2bf76d9e39aa +size 705074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ffbdee7b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f3b73f7d17cfcfb5e51dad4cfb5c8f76cd520c2ce2181e2119166cf89b0079b +size 756338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1cd8843097 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30cdeb48a0630f8b283bd5fe46c17bff17943b86d44d777205775b412f23750c +size 667480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a4af44dfe2..69fdf41bae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06e9d31935415c9de451c56ef105aecddab86c4e1075a32edafa86dd5e2db12d +oid sha256:c9ddfecb344d43a2ba51260d2323b3ec403b65a913507fab9a59f32ee5900c40 size 1102588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 07e9f29b9c..a98aed2315 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dac8bf7adda06e80c5f250a0089556e5a1305d437a7478bf0fdcdec8ebb2360b +oid sha256:4f9a5f85cca38f700fda64ac3f06315169f3db9f9c2a1764f90d46d3610faf7d size 1065488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ad657014b7..079df4c92b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76ca6d763174134e6eb7fb10240666e726d51116df6edcfad6d8c0e6016f4a5b +oid sha256:667fc2343ecf7817e700da66324f72cc4960e38af4539d3eb4667bb2bf4526e6 size 691906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6879a57e03..029867100c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:743266d7f51b9d59054be27a2ef1e6d2f03a6901eaf85d5adb9260c3d7b70a85 +oid sha256:f240a1a83d5a4c87ea766c57fff6e77ce895cd09cc595d27240378e4a9c21268 size 716842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index adb374c36a..c0347acf52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9ef16f01a96d87af4a65a3bad02359d528946d0547f9b30196febc2e7417923 +oid sha256:5cf81f5a4e205538a6284468c1f75df75d7293f96790ded992e8786a1776a102 size 697626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a1f41f1e28..2400c511bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e8a8cf3f75416ddd3dc1d36a646a7f2713c3e2c2f08a2bd555850b7fbd721cf +oid sha256:6caa2464e55e06f1c8bc1805dc4de2a55f1dc24015b31d507c721a5db02bd919 size 715508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 28bd897a9a..49c76f6611 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6285892d4a6cef4f7d428a87fb4c9a6de4bc77a299356c6de600278f9b1a4b0 +oid sha256:528d65d16daaa662e91c942331def9a452e0ad57ac2391400e0c9f1da2fddea2 size 761054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index dd25211344..733ccfa6b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fecc408daed6ecd5415939e770e6a4f41a7f2cb82d9bff8068d7a8770e219cf +oid sha256:00dc120e7c632b5a24d006038f937f3dc796347ec406fdb332599677514d3b3f size 785004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4f9379e668..a6c9395c23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d563d6227cb63538a50b4aca32d67843606c9886afb91bfc9ec8f23a55aa2e3c +oid sha256:a2fa09c6697365022036f139b89a80412f7dafe88e7c9cf20850a938daaa49f8 size 776366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 613842a591..e9a07964f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8997dd1561b77258b12e3dcf6c3c2ae81c74e51aafefd5721660af0eeadefed +oid sha256:1fcda4f1da8f4d405e3c3ff83edb30c42ba1f9a761b67aded8c0bb91a8e4fec8 size 782248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e514bbe419..b0fad51c30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:677ecd2357210f1945b53e83301749f40370f32b7068028603145888e525f27a +oid sha256:d19c84a7b052c6efa4342903410733ff3651d8a28b365e7c727a8dba888f2ad9 size 681310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index a12b4b210c..1d06463659 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01b28ab6295487a12e2db4b9f77d57ea3910e0d5367c99e36b672af6e2d82ce4 +oid sha256:d188b926cb91c390cd3bd0138b2e2e25df9a246a2e0f8636dc7f832ae52d0ef2 size 762810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 42cf923b0c..9d83bdcb1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc2280537ce6d6e3a81b45cff7ed1e8415db65ed7781790802fb3cc9a3b9f31d +oid sha256:fc12300e3d8ca0bf9159d46550237fa4fe23a28657b3a596afa4c31ab5848025 size 679930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 0ad267af51..70d6c54081 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b80903dbd59eda64a9ff72ac146505b4aba3aba16766bdea38af54186f7b6c4 +oid sha256:85cb69394686a9d0b446cc675f789aaba1e12d2f2168c45701f4ca4be57de5db size 591269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 754b968617..d2f92b0146 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f02e91d517915f5fe4fd8ff3f5c2dd4bb600883001c38fee1f5ea79ed205fa2c +oid sha256:1e4006b7340f12b7a586d629e9f47468cd72bf5f2bd5846e678de2a633c1ad9d size 621122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c84a50c3f8..732531bd86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61b8302c74e625a0898519db90f0d9896fdf5088c6fde29730a9faaaeded48bf +oid sha256:12db16768f65c4abb1fb152b4f8df3adb2f541f9ed02c6b1e663c07c5ecd6edc size 801156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index cbba522ab1..9a15faf25c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1f5c0aa113e306853b4cf15cc0d183377e7027180902e51b2d84ac4312f46b6 +oid sha256:b02dfb0ad923411c00f14877b371e2d265925732454bc1ed5f3d4e80360b1725 size 763404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4986a12356..782c6e3f39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35ace1043736ce8ea06514ca8ef73a3fa85b7ad45a95a1f1ba04b2d0230d8fc1 +oid sha256:e2e1bc4944ef20be283a64de954692c1a164d18d6b4141e6b947758c55b25910 size 679628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index af8d0fdd2f..23a77a6bc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ed55e07ccaeeb410d8beafd3fff1ca9bb4393f7bf71977f63b94faf6b6cbdd8 +oid sha256:ef55377a367f2b1774f58327ee8305dca73d79002346079985f0540e8d5b141a size 792288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index f29599fd71..bf8ea3fb43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:396f6cfad5e0aacf5ab39739fa9eaa9a54686a873ed3f1bcb1a572af977610d1 +oid sha256:c1b4b1b9996acecfeca85e14963e293ad755521c0f6453e62763db40d9675e29 size 705996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ad6153c87..0fe4daecec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8fe789426484f83ec0910a0461c54fbeb85146b28c23db80f6b3e85992a1683 +oid sha256:1ca45215b78a7a4c0abe33a97cfeedddfcf6fb9601c20cd3146244deb1524fae size 581995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5d1d9572b8..ad476384ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5227ee27e522ee490c56b5001f2b87036e7f9d9b7d82ff162d54e56f72640e93 +oid sha256:53c7361963f7d82b4b12f53acc276cac9954af6512201a516329d3be47e7943c size 543117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 56f44aa315..09ffa6576b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00db53d58539a0320c926079aa0c68bbe306cc63cf567703393822ae46d6a067 +oid sha256:98ef90e6acf9882dd504d4964f9eb5f93894e12438e95fc45e42b66f9d1ef18f size 567929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index f936cde656..1e88e21d9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b60e730d062dcd9bbd9492fcf6bc6099c9a570a8763989c20e9f2118ab0365f6 +oid sha256:0b588a00c5cba994307fc23c96f51d7de0ce9ac0b307e208186a92d5fd82d807 size 535119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0e2e6bc011..2d8d3579d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e082d623f48712d3cce34c75d211d16566b36883c004364cc5c36e9f5ef6d28 +oid sha256:2923d2f9a87e48990c7c806429bedcfb4d1d201b07834378f982424e12855bf1 size 776364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 29bb6a7b9f..2fd04272c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:390535b6a0863d95685f63e38fbce2661e4c4f00648bdd6ed42c7b126f8db109 +oid sha256:f689f8febede036701fc4771c290dfe6c290a3ee8873efaa5e930b73f86247db size 797650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 29509718a6..162a0b44f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7787c4bb8b65293f4ff3540d7fafbfdd67ddb290a3c147f7b85dbcbd1cfbf79a +oid sha256:cf65dd48162134934aaa25a29006e8667f46c26359ca3545bf29c59c8a75a7a9 size 589769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2568726515..8d06968ed8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74362539b024234b2191e03c53e35742806b6d376f2bc90ffcde8d4d2a293d20 +oid sha256:0486d986844b2daa658590766c4bb79993939726ba1a30b326b334387a330a9f size 486161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 32410f008a..5d216b305a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7a537ea0bdafba5158b5025514278e86f3f0408dc754233e0ce2fcf8fea39c8 +oid sha256:2d5e68590f2e57072e8e97bcaa7de5a8ae313311c9b150e5ed3c321a7bad1765 size 559673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8e17e056db..2b43a60709 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1aabae8ecf96cc4af8b6fc7166506cc50b5ba5d83d95ae23ce4120ecd251124 +oid sha256:3c81c5382e00752062feabfc074330aa5ac2e8c365d00fc2c0fcfb789884e716 size 459125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 370e6d4750..00cb5c27af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a0ac32e27b870bd39aa8fe68c365bd96c04b63284708d45839383feaa327451 +oid sha256:679ae4e69fd21ec683a306dc38acdf13409ab8d2d3bbcc47a5d912a7c64ba5e4 size 607435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c7f23dbc97..d9883f247c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c21f55085367058b6c6ad4f3f62bf1990700f111f29aac9acfa8a864ad8090c1 +oid sha256:25220c4eb9ecb892b7b63fe4d68c194b713ccad1c664ef0d60050b4a19fa4811 size 568559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ff3f1a7a8e..99a6d25229 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b47326ec34cfd9dda16d938eb48e28960cde3e54b2af7f20b4ca2c88b95ce058 +oid sha256:5d8a69f650caeb4c6a36c1f45d684f41d3a638dd56a75a97cace4b76c87741cc size 593369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5d9d2086e6..f934dbc9c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bdf242f2e1651c2337ef1920955a55041a21026794fb55068bd21c7f64f043a +oid sha256:425acc7097ac31f8763c27ac48fe9cb0720c0b3ed7d1b2d45907b133dcaa8288 size 559771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 50b0204b45..36b78ab85d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea744d35dbb924806e1a3468588f11b66fe444936d77aed99038f3bf0aa742a0 +oid sha256:6f4563bea81b2d332b580debde6bac0d0932fb43b48ec6a6506fe573ff11939e size 847536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 97700b4ce3..490687a75f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1cf4b6c61fba719c682fc917405113e6bf963fcb36f881b1deab49cf420d177 +oid sha256:798cd835dfc82e252a815a57a940ce9cadbe1141bf090e783ad7d6621f572363 size 831368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 19dd87ee82..e399ed943d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc2694b9d12b94cb8e8100236e4dd6a1297a87d1be57c04042f0216ccc752be1 +oid sha256:86726021a351b6ea4281ad65853a8e3144ac24c4e1b9d635c5c2ca4bb02ecddc size 709018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index c17553e6e5..cf2e969972 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bc9f896cf2e1b2c33d9960589b5aa39a04c17c6784f7fc58f9e60eb2eded8e4 +oid sha256:0b81c167582f9ad065478cfbc4798cad5defca059f53196badcb4243465c90c7 size 796190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 4902254573..ff81585b71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c11ba3f3277c1bdc18878aad57841c20a2d039f7f5902b327b13e3cdaf90daf5 +oid sha256:26f79c519d526f0cd580bec6bc82ca55b1597d199db1eaec5e782b2543855fb6 size 716714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c6ef153ff5..88e8b73223 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1984e6a86da0135e4ca5df1602f07106c3a3343e2de884ffe692b353a75c087b +oid sha256:fc6c1e9ec0a761cd0a6e5c2c913c21e003d1d1dc6cf04d434ea75c0437fed71a size 629880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a7e7b154b6..f6865351d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:919d5100f298b806807c7ee54b4d0ce5674f197643a7520ff8e5071642d8eebd +oid sha256:3374ac6cf21bae095663958c7e1f9899c8435cfe5a67807e6b3e81b3a5e562ae size 657020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a3c2382277..9bcb95d84d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4518cad61581c773c739ebfc58535f0697205d88eb86690fdd6f4555995ffc +oid sha256:fb3128f78561c952f2b9a4f06a407520af9bf98e2d9067014cd704d8b64ff94c size 871832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4127ff99bc..79a6610a85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd9df4df1e81b62e5a755a31e71c1d9b1e9577f4121763b5d6ed47c21ed51383 +oid sha256:ec08791257d8e49dbd2242fcea5ec7bd75c40a0946d23c1f9d886946caf057c6 size 614421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 24add1af54..64edd8bd9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37dc47c51314ba4807f383e990a54633b48ba99e5c429614f84afe826529d99f +oid sha256:57ce4d700d638fbaed5ca844b5219c291f4cf5647ba5af66cb18d47c23eb3ce5 size 509875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e8bb07c4ab..d8f4424fcb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3ab5e69d1629f00cbc330855403066a8d21975e277bf2a2fe51b5cf494135f3 +oid sha256:d6f35633d5c2619d45818637333c97d2b844b841bf566d24aa2bd412c316e31f size 582351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfe9f8784c..08a0d5bb15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce7f0fac32f907203841f592cd6da97fd44615a9220d0aac4a75cfe6f9a39b49 +oid sha256:506d1894447ec85ad223ccba1365a70ab2d565286886c34f4dc95b48e098ea19 size 481259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 51e092e221..d93a20dd26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12520b0f2d4a7c63fb8e5c939aa171ae711991da9e091ace1e2cdab0674a1444 +oid sha256:dd09e38cba8c24fb1241bf0e2beb8cd24835cf8f470f4057e2fe502b50f5eaab size 666696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..23d68e4826 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeb297c0dcb28362294f63b2ba5af45e3289a8bef5a2baeeb90023a707ee5ed7 +size 682112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a6d54a3529 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23494b5ec76ccfce4edb9320e5824d36f31a5f0cf3762162df2b85f9f41709f7 +size 592217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 5431bbc79b..b05893d24b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd99fd2058b6cd4d1db7e4900d2dae8a793d29e92e5557d466905b631cc7cafc +oid sha256:20daa33e521b4f2e09e2dd34e28fb7199747664314e574541fdd82d9307d2258 size 663142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..91bd5cdc3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18763120631f02986b3e2ef1590abe8e89ed612dc3d23cbc3859c473db99743c +size 680482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cb76d6197b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c07c5b35379391e2d40bfd5a9f90b7750866f7357c6c6cd8550e0ea2bbc9bd9 +size 594831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 45fb5ae86b..c09a02c748 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9bf12cf42e99748f39055efbd50ecd73c807cf0428c30e1e29fd54b8a62bfe9 +oid sha256:f1dd50568269d24c00132ded9d21d0420b6d7785059ab916a8917f0f86e5a00e size 731454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..01107cf44c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b3b966e52a1412e4067599d352ee95d1e105ce78a2b48b5771a2f1a1b7d314f +size 749582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4e1c802e19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e15cd3fc83a52e82c54a713730d0133dd2005d304f3e2688947fb2eafbc232e +size 662206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ddea7d9157..c8ce6931b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84f106c2ff45a0215ab76cf46766e494a54671cdd5be4b9b3fa25d4bec8ce5df +oid sha256:4d83abbe12184558ee387bb0bce832d406e722f3c5c415d678a1f8375fcaa178 size 769588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1c4ebfd307..0e5312de3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d1747c8384e43af5358cf5560082fa0cab4ef116840550d0ad51761e765279a +oid sha256:e4c8bac67b3ed63b5099e3ff3930a52c45100707276305527c7e9bd322686487 size 752808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0c11107aba..801a7358fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd00c58245c30a3bb1dfc5f6aa779a85ac8b4787b23ff2709783204f24400e4d +oid sha256:54462dcb64a93bf6176c9290379b1ba80bc040e897f8ad591b8b3704da38a026 size 754216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ea9aafff93..74bdbc571b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ed0d006ba6db1e76a9a633cd0fdfd8a07c880d972f0ca1c30f4ecdfbc34d25e +oid sha256:be1219ebbbf1af07622543c2f042b1ab5391444376196578caa589a46618f772 size 719496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 47de109864..eeb54ef656 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f201774ec8c1ca48a3b11f468bcb8ba1d0e2e8a06ccdabb1b0f8092cfcadbdc7 +oid sha256:4477d058de9aeafd50879e24f2eb341fc40bd176876cb7cb98f8daeb969b0840 size 778202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2830e2573..49e0f5d019 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a086e76556a46f671affec9215687b6919b04e040d828cf8aca3d4f198e87d75 +oid sha256:e8856da891ffc2bbc08585eeaebc9ebf8e38fc93655b30fc49cc1d0566dfca23 size 673508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 4b8192513c..0026176827 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0e85e50c3c4e8894db27cb7ea18ca3d6c3aca15b01a5bf709053226899d5238 +oid sha256:751bf495045033547d58e78e5f559e008236580c600d9815ed6de4260560151b size 868284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 6c398ffb4d..590093bdc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:896a05c45cb412cf49b86e485612e5c67f1b3ef3b9c02ae0292165f03d288b4e +oid sha256:4d045b4fe1a11cb138934f5b27912c00ee67d4b01911eac156ea466448f6c545 size 665622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d88a478e20..fc8443743d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52ac857fd562a16b94b98b2827420c1c5412fd6b6c70d24f78ddec40755dc96e +oid sha256:869575de197002d6df290b19fae76bd7eba6f1e43c0e33ce660591ba2c72d406 size 715792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4303a5c816 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7b9bfd248f739dcde36f683ce29592a080586ccfb3008329901d83da55af09d +size 770766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5801ca8b9a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ba879ffaacf04fd439295af3bdeb38b1385bc078a377bcfc719be6f5dce636a +size 681564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1f1769ed98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408125096f655e20cd296f4ec8780c3a32ac9e36973761ec22a4ddf45b6fc45d +size 639420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3f837604cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4140bf3191a2f60d888a6fc026f181984596fe76ddf2603d5d9cd7cc6720577 +size 605475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb6ef4eedf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77fa54325600b2d6021b129b084c4df731ff9cd1d7f936ae9d8171126c854fd5 +size 621900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e04c8fbfab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d208f4efcd39b327d3d0c964e580ec7ddb2dfa203037540b4afdd72b57ee4aca +size 593579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8d544bc565..19b864d35d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83b9faf68f501c0605ef6aa228ab10be8cc6598102c1e309d332589cd66ac4df +oid sha256:87efee2ec409647e45186314c8f0cfec1163a397f660a17b86af2ca6ae1aad1e size 753522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fdc9a19665 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96ff3a35a9f6825ec9f4a165a868046e8b26b93aba52b6baeda35653516e97ff +size 774760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c261651dbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e70ba77bc9bae3c29bf99e8e1c6137d5d06484f0ece21dd02a1be3920519f65 +size 685804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bf495ad335 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d82f841cb84a698675ea405df238c958cced8a81e2b30dc2a5e7e7b89207681c +size 644382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ec2cee71fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cb70747f75c7784a4c475fcd89fa7f35ed105588f8a4e2e9a930420ceb14403 +size 540131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4ecf6358b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47333cd4492a6c3f70d207772b7cd4d98fe9129c4443fc19a74233545306a9a +size 610241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..09df958255 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcdac8d1618914e78affd44d25af8f11cd42a64946fdbe41681f7493155534e8 +size 509593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 60a786104c..233bd3b995 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e02f8a4620ef98a22d0c2fa2a4acd12016f2200de249d5b861923f6114e8cdf +oid sha256:9cec9a7cdead29aa66e57e0214f13296300d206ca465f503d53050877e94b96d size 798974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..20af45ca5e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d0ada682f8ea70e3d6e16031551839ffb8c962cbbbf93e0ee6534a0bf169e6 +size 665698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..590b1f1d20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:446942541b9df58c5f247a122be7accc8bf9840ca403e9fde600734080fdf2c6 +size 630176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 38f4e58f8e..e8912ce85d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7005f7b4870bb8406629863cd8b74ff33adfa2a428c5172450d4c7f32fd0941e +oid sha256:89e5bcd04c80f0c6d85daccc06018f7f059fa124dfeaffd33110e07c7462f977 size 781404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d8df7051f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfd0d45b3509f503ba1fd66f14b02708fa039b1fcf152c5bd5d833bc0ab7ea04 +size 647390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..81301eed60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f1456211b3dc9d103e77fd3986a7857d4ec4ae7940e2655fbd3991b54ef11a5 +size 618478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1cbf105a0f..341e88ca9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0361d133b43ba52ab558c0f5ca613010eaf28be1c6aab0a0b4adc15204235e21 +oid sha256:787ae64540e234dd8387ab2798b347d74928e0be4544fd9f6a226a969fb93b23 size 821488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e34c32b938..ae8abd8fdc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3d8aabf5200311f3938dfd0927336abbf2c542b3b6c3d94df904b85fd668361 +oid sha256:f107c49168d30297cbfba4d3515707dce9f54cccdaf74f25165338f86e0213ea size 766790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 726d574768..0c32ed06ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7917810143d1c4eaf627ef37a4a05bd7cb692e8902fd01e73b72d673d00ec6bb +oid sha256:214ffe8bc3d1c61ae85e7f562b1bcf77142cf63f024f69729e8bf768fc3d2d80 size 806748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 04becf20fa..7215f7be72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5517a266ae0717c3921c77c4547cf0d3934119962cffe187ca9a8405e114203 +oid sha256:bf90e4272ef080fb68938f29ad422c52f84a59184df385129e9e8584fea87dab size 700822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index e89b4b6db7..4ff37272b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c7598fac68f3b003545aa4b1b70188f3db0c99271959d1ad836f40a5a09c658 +oid sha256:f4c2c807d9845339f80128d2be790ab1c76d2591d1e01a30f8a0c8cab5ddb054 size 904280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0ca6d28301..5f3ba634d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd423cb8ebf6cbae840ac38dbef3cd63f30c475ea4e64204535c65704f0ae019 +oid sha256:a804cd605741d5947d2328fe0db084c1cc18c2af8067dfbeac953cce9ebcc9e9 size 699744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7f14fa2a94..bfb6d975df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10e0400bdf55026240fb8c6b4f9f8386b24390cb65ff9363b52c3a9c292de44d +oid sha256:c65f8eaf81437985d17dcf24effc3db61e7e05835467563d5ec00ec5007fc9f8 size 752824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 00d0853c82..849f170ce2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12fc580cb6bf5cc590c691af86e8ff1b284a4cd18568438bb2db5e8c4c93c704 +oid sha256:95ad70188e48a2ba92d5e980756c37170964dee9b5f222b604a0f8fdcb563c58 size 651584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..99040edbd1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d35bdb6f7f0ecb28d8fb79990b75668ad6973db9b41d70fbd4d3940d151b7f23 +size 841640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6b18982657 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25951f23d58c5a8e4aa7fbf4d101ba59324d146752694363e625b6fa9808a0ab +size 752784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f97bad20f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e2a278306acf675fb7cabb2af595b408fb02876956a0589545a944037a7c6ac +size 668638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5ebd3b757d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:515ffd5b3d34c5b6d3526725e82441868e67669449fab313c1674b18693891a2 +size 561675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef56d9aa8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a558447663fce97d29ae4e205b69e51fe194385cf8a07879b590d4bf41fa06b2 +size 632524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..aec4225e92 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f993295d002074be6bd798ea5c9ce9ad6e0208ade085071b8c4b7ffede013a03 +size 529557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c9935464dd..59488b0c90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51614d07b50059f772bdf877f92f37186732eeab9987779f273417ed0c2be3de +oid sha256:0f09122a80a3da1a7d5922c062509786ac20f185b2109d1619b8053d7630fd0b size 652190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 19aeeb58f3..88800e286c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33e1f9914d943ebac2bfd710c385cbae94443d46bbe6c1eb753c5ba5631edc05 +oid sha256:cc6325e59eff3b27ef0f626958bf311939940cdbd8337bd60ca3e5e4677abad6 size 678804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 69da524f35..b1cd0edc67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81e94d842cdaf62b2947a04170834c62a24374d2ecec5de5f4f1a5a3ae2ec72f +oid sha256:b4bc124505c7f81f747d18204f3ee808ef9699fcf88b5953a7ceb3453440c885 size 655394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5091589d6c..08dee17763 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0a36f397a1b0ab4409af8a8543b0167e2d97e2357ce9104bd9e948a56b953a6 +oid sha256:e5bccde9b75ae69537490d6e9661ba2bf99fdfb6a42664b68f3632b6b3300974 size 676582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e9a6e08deb..d2daf8511c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f312224d032c1c0b3bf47cd60d77fbd5631865cb27b5c5143e279b39657a572 +oid sha256:3849eba650fc9cd98265ad85a54eef05c6df33991bb49caf3690140216c95988 size 721338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0df2c867d7..8048432dc9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96b7c351de35f4e483edc22a48e98597f54e9a448ed2302b8929ae91bd5c33b7 +oid sha256:cb8c88771ccd7eaaea38618d936aafb93145eb2078c65e3c6d614d7646a76333 size 747804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4bec371576..091ce0ed2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecf8b56e5cbed7ba107f3b5bb0f1ae577d50684faf66f5b9cc235ecca4c43e10 +oid sha256:85471eb6479b03816020ee2815008bae244db550a2c77c6ef1bfb7ccd50acf69 size 737292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fd7a59be17..f8ed0027f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ff82d119538a6aaea8c12334ef83a66ccf1da930e804d0c58bc7c2a54881721 +oid sha256:b6d01de05904d61fb441c67ad9ec576bd370b702497b44a2d71f5ab39c0eff22 size 727536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d6651e9c60..7239b36cf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be3ac00e750514fb9439a446569b0a4eea4736efe79c0cb1a75088332d398efe +oid sha256:8e676980cdc766629db5877dc7232041456ea8e2a30ce289bcd008d5841286f7 size 666016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 6cdfddab54..e0b7dba5f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9232f1e4148f3a3fc3bc9421b57c55c573d0a2293b58ffc2ec18018546470787 +oid sha256:1dfdcae7d576af517049dd34e1f775da44c194e79cd5e6f34fcbb97f59d6cf53 size 740360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 882896afe2..9dc57f0280 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:230724c44ef28fefcf8018903e5f7f71ed8231cea987025637ec6ca15a94a856 +oid sha256:cdc1f3a30bf1b0ef47c2cef502f11f780c5ad98379243baadbafee3a32033a68 size 663746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4d8580e95a..7ec26539b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b088325e8c3052bc1c41f54cc0ee42f6b2105cbc988d8e54abb2373e2aa6e53 +oid sha256:c6a8ed2f5fc92f5a952cc86e0a96fde783de8b67202589ed2e887e974882fd7a size 573555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b9c540a5dd..d620f15e41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:511be8f112c36c22d844eac50acad2e6a4dfe2494ad0bd689c8c16194d961a68 +oid sha256:7bf41d9b1e6cc262613346033571648062386dc5b56c3cf2633c94805f23335b size 623192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index acddde5c1a..66702bfca9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e89d39723091b1ec8b340aab64f70ed980bbc691462cc9c6a936f07f8e78147c +oid sha256:f40c3408165cdc4228c860bba86f8a60cb10d672be98efb47c7777ec34cfb850 size 763018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4d733a84a5..d16f9bedba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c943020afb8baff3a918c2bdc81043df9d20086c15c6e1f5347fc5bb129ab101 +oid sha256:4ddb1b75a85cb8e259bb93e31b64c9340a8dab478d7057f28b54494e4139bc30 size 707210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8f95d301f8..85979cfaa4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edb3d5fdfb2f60bc418f3bde67b164ed6388d012e0653f16f7c284b9e35d371b +oid sha256:ea097fdb44ed6107e7d4caf4c9e96b937025621f836d31b167d98964ba2454b1 size 618896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c098526fff..2f22e3a6ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61b21f02454ccdad63a2b395dac062b20638b36be6c91f2a83734fcd4080ec24 +oid sha256:b2aa1f26533f83de01af9c832b0508d16b4198dbe0933a4f330f90ca8a15e6f1 size 734466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 70aacc8fea..1cddcf087d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58aa224a9110ef5e99d787245abc7bc25ad01aa34e02a9004ee0eba92da8dd4b +oid sha256:43f94ff78c995eb60e87a5e47f8f5844818c5281bd185460c8554c877daf99c9 size 646102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index e45c4eaae1..cefad60f2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f8806a0b34491a3b8896c6411853584a63d61ff4080ad89390e04214f74d278 +oid sha256:f32d4fdc53990b7c6954d6a88cf427583a9ad2c67dbfd3aee23e94084eb2a58f size 570301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 11e5e3791d..621852bfe5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:892b7e0d0316cef1dfcacca5244db0b37a677af9b4405634f61ac905bd7d0a37 +oid sha256:5bab55a152347e4def1aeab361e3eb2dcaa0cf61dee93d87b802c3e49623f179 size 548691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2f33b907db..a04624c23b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ff33a63f616df44b6b0db1f928564c99b7ef1c32a3778bafe97d4e3059bc323 +oid sha256:9db5ad2ad73d3f93e0bddb33f0b979b4c50870b9c0e7e04ec7357173fa6db2f2 size 560379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 33271780f8..d014912a14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eff8151ae790734afae08bb9e956eba6326e381367f1a5a01d4e2cbe87973a1f +oid sha256:4daaf03407517f711031798944f8da8c7314364efc996218b9a2535fe584e5e1 size 540001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2c4e6ba266..6d20836e5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e24ca5787a69b5eaba052b7f655b583c3c5c4ea67b07858468fed00cbc26c1e0 +oid sha256:54a182012ea6040fb9c065d7d1ade0db94cb132e499f4c1c77fedf94234ca295 size 735070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 31663b570e..a7eb8b1cc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a9c55c5b07a81cdafd46665c19edc7c207bff906befeaf9f5b0013ed2e12309 +oid sha256:ba48b662606f1a08c512774fa358ceb408f0a6e916886ce5d25a5e930762fa58 size 761092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 15e33a7f8a..8d0bdf30f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56632d653f458ee36ea2eec0e6815a8906568bb5b95307b51f65452d782df6e1 +oid sha256:5bc4f4ba42508a8782b5d3fbf94e95871502a94ff8752c2b1ca734a48d4e4a60 size 589965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0afb04c2c5..3e379c0336 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49efa6b48c4c5f4fd485a2912c361bddfe5e5874e24844852dd4e08242570a62 +oid sha256:9f77adcfbcc903eed81d54dde560f802b0a076746bd6209c1b4e349c0e4e6761 size 486061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6e1fbba158..b444de3c84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd5cf28120d20dd58367b963e9cff124bab9a2ac50ac47b019986a07b423955e +oid sha256:f0693e30dc3d139313f7c899259925f82034a61f8775369297b4d2e933e96423 size 565247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5aee0fd98a..0e54bb2e92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efd20658df653c1f2b1dec355ada844202323cb0009943c03c568f4d71636c34 +oid sha256:7cc8635b4edfb0df548a2da247cf18b2281d3eff45d66ca73ef7441d1c40e71d size 464795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8b80e10502..196bce0644 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b2aadba108292bcbcdc4b779164669f29f7489e4d14dbb6de95bcd254346a4e +oid sha256:ec6d3244ce1779fb328746de68d974f37965dc2a661f37867ba4fdd3f98c5805 size 595741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 228d14e917..242fcf890b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb29d37871b633390147d3055496e7e28e73a7a22a785d7d8517024098eb8dd8 +oid sha256:58aeb3a842b83be9369649800a9f907fdb9b4c1ea0ceca758901913963a59fbe size 573391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b51a0bd425..eda6aac28c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1007e90fcbe04d835874b63ebc002618a09bee1193d41ed0e2386910c614609d +oid sha256:743e337266e22935dfeab47e6dcc153acad97c5ad80cad65e3dccffe53507b27 size 585819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 500898e674..f55406f2a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23d18157c621062a5f22e042b75ec465774ca663244a8cd12ade2c9a3d907c0e +oid sha256:df7917b43bd1057890895a948c4bc26cbce2603ffcc150b49989d2110bfaf6bf size 565443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index db0f0a396e..cc789e97f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6855493fc4b7edb58a81c0313d71582c87b0e51f2dbd651bd536da2a1edebcc2 +oid sha256:68e180fb73b795b22c444097d7d7976622a5063829f407cf084feba09d997517 size 808462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 9e73e25182..4a2e2e4438 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ba6e440066cebcd861b87c69efde3d1dd3cc439db08ace5c20c4def217962ea +oid sha256:6389322b02df3261c0fa7a651c5f58875978c6aacc25c6775bcbdaec867342a0 size 776358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1466d1c9b4..1675fe31bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06b41b8572d0a0a1c01162d5335480157896b455b1c4b17b9e9409787936d46a +oid sha256:394665d4e71219ca1e12564c9e0315ba14c4ebcbb0841514c9b6af9a363a8f1a size 695352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index ecefbd5492..7f456783da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3de4fd22e97a3c173cc8c27c50b63951152b16583b50b733edde1e3ecba831f9 +oid sha256:5f277f2a272c98dcce794457d5b80803fae9de15288ef691564c48448b1a8034 size 771966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0efa2045bc..463a140a87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94de631dfd4f33f76cd0d7779c8d363b6abbcadd61d3fa6ab65066e8c9a85612 +oid sha256:1f0c6e46d71e4a76ee4e4787e8e9355fde469cfc67692d0452e505441187024d size 700482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9a9c9f1852..909451dad9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19c6252c70af2480f3171005c0fd972ceb5a3510b32a391c79435aa249e3b51f +oid sha256:2aba665d4f2945ebd86b89ea7d5feef24e7c31dc8877f64808eedc168b404999 size 612955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d901893ea5..ca11fc8b8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2204b0603c0b1c6e7262c1688397dd7e4dd9420ec613534920d7f000e4deed9 +oid sha256:13268f581f473bedb8aa6d69e0c9662a4cb77c69e519dfe7c3766f908c687507 size 657362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index aa30d1b709..40d108ad14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ec8c070a3b032a3f5b32665c2c4f79a4cc3230f0a65b76ca53e5d1ba46632ed +oid sha256:fcd72a319989d850221feeaa31f3e11b6ee44d879ceba5f2bd87e08d9b53375a size 835226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 10a36f5783..977047e29f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78da08e8cf62babc0c1ad6f6ab63336973a7876970125b77ef5dac10420704da +oid sha256:64bb75138a04b5627910eb79daf28f13a9b4e7a2e05a89d2e63770f03cb771be size 614665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index caa7b027cc..c611da7c4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7513f8aea222000e1aa3212aa9cbb6a7e16c7ada1d883580086821844b890e8 +oid sha256:c016c39bf6103f30ec90b32b7985350c85247a02163d84bb3606e334b7499d61 size 509775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4ea5b4f564..5a718d2694 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaa5a1ad731b8fb5992929766e60e9e5b60724ed774d6151fcf1c5a76a98bdee +oid sha256:cb294e7b3d5fbf32f1c7f7029daebd53abf1f4ef8e3a58fbce6b67cb04301898 size 587233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index eafbf70bbe..8d7c3e44bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:408fa7534aa9d491661a553d262138ae09f56d42009a0ffa1745472620ccdc05 +oid sha256:98dc15f981249a315e69c2ae094667458e74b04117df012779229dba90462b3f size 486141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a2dc7685f0..852421d9d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ff16ac6162cc607e0ae6e3711646dbfc424fd3b4fd8fe53c95764f24b377c36 +oid sha256:2207759b0e3c12c8e3b49b2abbdc5a58c2690fc8d2c27078eff26baa81d4de88 size 692560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ffe4a79826 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8c985f5c03522504b1c529c6c3bfc0767691b16b814c67c89b12f9cf6d99711 +size 717498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..408d416dc2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcb8385ec4adf699e726510eafe4123dca6b2e4f3befdf2506cba886021e085e +size 628048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ecdea7bde8..39dd08ec62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6deb0e0c2250e4efc56c9348ce5cb59f4bea2caba5cec8d12c307dee303f48f7 +oid sha256:1866bf7c66a5a117ab1ecc9405559202a4b4938161ca624cc41f7cb4cfe290d8 size 700650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..76bfe2f255 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:905f74201b1a17d6407d43a2925a955c3f8cc99f024433628c76049a76e1e2c2 +size 716262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..053fdfbf6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39cde4e8bcafbc2341af16c9e92e6efaa9466524117de5f2b62559b910dcc6f3 +size 632930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bdb964075d..1449d485d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b0cfe8fa032cce8129a8ec0fc4c77455421fea14f1fc3ebf806296915ecf28c +oid sha256:c33fb9538dcb973b38c5d4ad68b4c0d312971302098160fbb5199c9d08ab04e4 size 781510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7d1c4fbb66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:803f560c60f91f301b27d3e58d16789ebf27ca0ddd251d07ee414238c0694349 +size 805510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..74f3f4c4be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:428ca0408c44687012e1d63a30904c9bdcff4f9d3330df53b893eaa40f51a86e +size 714582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9e63884374..ca1cf92055 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2971af286b2f4594338967c07e50da203e2129ff04145a4d06db5d58eb50fe09 +oid sha256:cbf0494f1ddfc3eac13b8aa5e4cc91857c1a50bc479b5d274d148ff99ea5caf3 size 785160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8d773d924e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a30eb19e6ef3a6d3609ec1576b100ae2e8381b3fb0e1ea5834fd070f72dee4e7 +size 804226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c989a90b13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3d045632779dc8acdac7c11a321f542fa9c1c1ba4cf9b9ac514fe970cc7226f +size 728294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ce09b485e3..37f9feb8fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12a4c06ae4bcc36c1ef243fe8b20d8091d9c3c5244a0a51b3c691117ff6835e9 +oid sha256:9ec1a75cee8b1a3bb8e76190f9a662e5d5986f114a2e54c345322e0a029b956e size 692564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d44d5a046f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:977167edc014a4b29d408ff8ba01dd8a8f33f3c5a36fb146a0f35165faa9adf8 +size 717404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d8b0d0ea4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:247e7b97d4f19b96daca4536ecbc60504c607bcf20a1ff5001d9b17b84d9325a +size 628842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 757bdd25d8..96b111d654 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b2ca6e1c3e64c123da119da0953d3e09c298b7132c22f6537bf9e76ea58cb65 +oid sha256:74d3f33632ad878790e53f4265dbbef980bef0b40f3fd22cdc3260ded6955e4f size 701442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..313108d5c3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd6191ad6c483a1f7e271ce9c9163e10e33b110088f148af6ba6c2a7e861595 +size 717056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..bda0742ffd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:550e16e6b856c411887c2b9b9153fed99ee4fd90e9c1bd78c008fd814a34db40 +size 634514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..210201cd9c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66140ce7a8142714df0caba9e16ef2e215b258889369d759e13c82bd575d79ed +size 681784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..57c470c4ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4a9ebbfb7b4593005d45b9169d0274138c1153595fcac168d93077be4e41bd +size 672062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1379eb929e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dd7f8aeb198d85d8c67442cc74a081e592df4d2a137aa6b90d3be2890a29589 +size 642956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e9a30b253a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b89b004607f3c124be8316fac5ac2b4d646b4b5aaa640151095f9bf59be7847 +size 635012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e4c0318c60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e6553154bdbd79c5a4ff00523d127e9bace90693c8be62a6331e6cf427d775f +size 663094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a72b7a3990 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98c16fc4bb762df6403080573fc261b6a07b5f44955c00f44d6a678596afa566 +size 668506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..77cbb06536 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d289fdbd9d3b69e51b4ee2c383119966a4af9db52556c94ba5698705c30f663 +size 658786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..760028cbdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ace6d731d8332cd0124c7ab7b49ad42711bbb95725831bfdfd29843c7d3f7fc +size 635698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..351d5c6028 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5d7a023b5a98719f0819178d0baa0e6ced8eb8aed32cfda129a9576fa2dc3b9 +size 627852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c11d0a9545..4b8a7347ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e97acdac79e67cfe6e305f7d7d728d62b170781d4a745d9fb71ea462a389ca91 +oid sha256:c50c4b6dd195d35a7c64bade298edb494c689292bef19aa4c67c24247cbae788 size 834624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index fca74fa6c4..ce1acc9e2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8918bc9dcc64b2d4d1f54a7209295135066497603e384a5619ed4ac17dbbd25e +oid sha256:ab474a4fc80f0eb32aa07a69fda2f6cafe9703a337c1313caac7620138033449 size 812718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index fc71c3ebd8..877c5f7f2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b3d0e1bd5d2dd5f1a566348a41c70399c5a2aeb34249427143fe1fa0b786e0d +oid sha256:f630610356a4b72c13e9a984466dc053f33774bbca575cb98d62a493489126fe size 693826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f3e52dfe7e..9e79a5a316 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83f7e7ca9c842e54c29681bb3f72eb200cc25ef593ff8454fd36591b6f3ccd18 +oid sha256:e6a4e42552792a5f1dfb5150f9cb08264a119dc12c8b9702437c26d8a7aad99a size 797178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5344154b2a..c8a7ed38fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:748ed82680e0dc7993e68d05c8f7168e1e54c897dfc0337d20098dfd831f8e4f +oid sha256:2818636b287b6f3b8095f5c4db1e1f3732a8e320c4cf0766ab40b32ee350661b size 704966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index e4eeb0fab2..6e88568851 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cb601b416c66a213d06501405e0d549130d94c57d9bbc6cd7f7fb6375c96984 +oid sha256:5a65a0b339625974e6ad948815d6a9a7a094540db918d040f548d97a14bfecef size 777688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index c9f900d1ec..2ea3299cac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32f16fd2f74e0e3440167d6e87ee987bb7472b5633f21103fba668fca6f30e2a +oid sha256:a255cc9bbc78b0000e7024abe312cd3641f2e16563fda30e58fa3afb3ffe67e6 size 680396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..93c8892dfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c74be274338178bfb94edfce366b61bb613b6c730f2d830219e23c7ba50d5e24 +size 683786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d0468542c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069b044d987a80da893c38d910f60b31025a0c62c7900b80306d1285f3d93bac +size 586739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..42dba148e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a137de8893936af12e756e6c37b7f89d4135217868fcfce1c321e3e90504dcd +size 675890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d26778f038 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b259e1345a597d779b88043aee6127e710b3a329c0e4a407e1c4f85fe283df98 +size 576229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8af36a5134 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df17d8e03e6e9de76082672cbed67dbc7a3cc1ebbbd3e42d79a42fff09b1d1fd +size 715606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a752555b0d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:634f9d8e960700bd580fd043ad6a810a5bc26be5c0642ca16c7fb9da7a9f3dfd +size 626700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb0fae50ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6abb0e52152c62338212a398b7ddfbada18b4f46df19d8873edd8d29d21293f4 +size 653198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cbdaa2f42b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e345bbb7ba59bc1558027e598b88cc781d9c00a02ebe1baf3374b38436132971 +size 557827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e11f742541 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f43471d1fe3d92c398f1fad48aca96e35211d2ce8fbb15c9f1f12e9b00c544a +size 648262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5589c70018 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ca0603950d9ed4bc8056e65f95732f7537d8a7302aa73d249017e91d0f93a0c +size 550721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 051f491509..d5b9c62d7a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce2f07a8055f1c1b33aa1a55a6785ae589cba66790063bd6dcf3f7a10016830f +oid sha256:49e40ceac472ba845a5bf4cd7ab86b645a607fa23e40d63426aef6b910bde6c5 size 740396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6c7d5602c0..77fb4a4c5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4befd30b485f0cb022803893ff1b5b56c65a226a2c86b73cbf13ada82ba92a7c +oid sha256:18b5c98f027494a3f4b1e76573ed484ba7ca51e64c418269c5e8df84759e34cb size 712472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..335431f28b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc0d83d2c9ebc042c3d5b64082430c834963d309c60bb84d5994f0886ca9a12a +size 715774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..582c8998b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5b1c923b609b0c65f190ddae9df450478e0d86409f77b1d192d513a44cbda1f +size 701810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2e69a6b8de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94eb4d8ef30ccd6b7107f5aef27d32a563dc38c3a3fa3c8d6b7be18e1026dbe0 +size 681830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8791066304 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:500c2122aa200dd9dafd43a3bddc1f6f713ad5c7a2a67f4406497290253acef1 +size 669692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2be8b5effc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:173c1d5ee863d7cd64572c3222601d80ff87d944208862277fde7917f898bd53 +size 686536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0e34b5452b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38e079c59ea8fda5f680b91aab3c29e1563affc9750c509601267a1c0639a2a7 +size 717854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c7cadc3bc1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a7a961225962f5b897bcf75e350b59d220432d359ad1949d5de3a275761aa14 +size 698304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..91047b6ae4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8c0d0d02637e4e5ee29693d3e6f3952d31b5700babf9b83d195b02fde937bd7 +size 684292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a1cf9d8b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46f1c720e4c96d3877b8ae45a9c2d4a4b02d317e7ef0d10bc7c9b2c82c31b872 +size 669984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bbc6de844f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53723ff1d0e1b4fb016da6e19262c5ca9ebc892db746bc1b72ee427e5a528ad7 +size 657846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2afd3ede61..effb24bd06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e03a2b4327c8a9d1599240b12c0226293eb6f88fdded99d7f88c93d50390d44e +oid sha256:0b2648ee652d394f425184715e1bf7bd1a47abe98e4c2617167e779abcb8b795 size 890272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index d5ee5dc91c..08159645a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2080fabfe267d4cd0f05c502fe12970ed93598824bbdae3ddd8f604008cfd31 +oid sha256:fa065dc76a687a8de339ed4d12fadf8823e3f5f2cebb15fcdb8e67fc5babc750 size 859486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index d798537e53..1f2ab3d740 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d5675ccd15066c43313fe2def03b268f904146c840a2703e2d4c409262211d9 +oid sha256:cc4b67edab2db15f32adbeac8b0e8ffb6b565c824fdee65921afce9689fde634 size 718796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 4bb210ece7..726284d1d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e49f281d3c45d204e8ae3e5f23af4465ac11a1b9a49ef2aadacd3e7fb5fc4553 +oid sha256:efcb6451b57c35b1eb873036c3d6d1cca53bd87bbfe11aead08cd27f0515791d size 756084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 824b97bc3c..304b7f3f8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d4c8a5f46ac8dc084b9d5960db7dbef244080908b38f46b6daffba58cb0ded3 +oid sha256:0301ed1ee38cc1482c802271359d63ec656706f50bc7631b222ad4e5d68ecd8d size 846166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 08d1741c0e..652fd97d3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b82e41154c4126360b74a28e9d02a7949633bbd39ad483e5c553a3a08d70daf1 +oid sha256:fa6fd2ad3a50a468d7e419bc941235605e08878aab23448ac40a42fff2adefc4 size 817156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b8f196dd57 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f33e5f35f682b85a51b410fbf33174e9fa89972505173bdd1f1de5d7cb61e22b +size 714620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d795252528 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e84db8d1799f3e4c52b5e2d88d67a8910a5b91d87b7a445b5af9b4ed37dafb92 +size 617325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b9d011560d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce1bb248d6615aef916d9e32794b28303bed9f898c416997fdd275e68e84cd3d +size 702334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5797164424 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a1b5b1c8145f484759234da2000af3d309748d9d3a1687aa52593f13bbbc2fe +size 602523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c920745bd8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1956cf265118937321d6e9cdac98ccaf315b8d013bb4f4b75eca2d8fa83bb4 +size 739048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5620974e42 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e8e8ad6c2af06e21bf3ccd8d0cfd66d1b7915edb752cdb0e07347027df2166e +size 783636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8b6ca28ca7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5dfba4b181c1de9e9c854e384d4265303f44b52a4abc0f9156d6a54cca3ef4 +size 648956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c3b3c3f907 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1a4f5121f151ca4d1d8230a476dd5a54b689c192281fc1bc76eb3b5c9bf98e +size 682298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dd9e8eb2d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378a1b9244615d04cb210132e3d6e5f031008bf368e7ceb06db1d95818066e18 +size 680726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10f0fed1d1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1c90c38437294800dbc182405fd30156e6c7f69024ff379b2a0e2fcb01c2151 +size 585601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea3b184bb9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c93ef3461d2343b0bf15ea5f4ce928ddf91cf4e31c378be42c41c523ded26bbc +size 671400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..57f929dfa1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa00c8bbd2677454d6971049f5bb5a3f1e49a834186c89cda58317e4833406e1 +size 573463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3f87acaad6..bc53d2e2ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b9765b01a0dba4ffa85e29b1f28f522420705ffe4caf61cc863cfaf87a4ca25 +oid sha256:121f37599b3facfd228e65ed65ff3f7230e6bd2b8d1074a56200db5cdb642442 size 780456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 9ffe47a51b..97d22cdbe9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63fcaa8f6ad78f26d360387b6ce885e2b0d8bd1473351a0cd0fd5165ae64d63b +oid sha256:29db384ce321efa15724140a81195570c55d029a2bc074fc30cc5ab457d94491 size 808090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5af7c53ba6..a9cb7496ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d994cf184a7f8b61ca19147bd9c32fd77914b7d522cac50309dd60c4ed4c5bf4 +oid sha256:fb9179a1e1cfae0377fb2fe7ccafa71b7e332538209aa44655786b4875bc763c size 749916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d4d42855a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:716816d2d15edb7626cfca4368373efaa2ba88ae4a990415c2485513f3364bf9 +size 779984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..27f3261cf1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca0b307977aedb1cc50dbb1f291708a79a77867ecef6969ae59953a4593d0029 +size 757388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..16ebcd5c68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e960223a767c0c7c57c2429646d761171da45639c368cb1bda4ad93cfe1dcf5 +size 735878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..88076746ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a4b02c6eb81dd6be8b4a25874d7a010f3bff13016abe49c39f859ca1666ba9 +size 715896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3692e3f09b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb4c706b48e9d6c05ace138219bd083ad689d88aa956cb304f158308f7d632b2 +size 822270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c6513257b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3161fa440c911bd4e738f6c1bdf57700082ba372ce851f25d46c931c6917b9a +size 754324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fdb49fe491 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc15c78acb404a76c5774ce37aa70e22e1d6d3e74ac18678a93ac8b03f4520a3 +size 732518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..561d0f888f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d1004a7478f31ea67dfc6d0704fd3a934e304ef64a2e8c1bbef5059904c4a33 +size 721220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..954213ef67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afc3c51e4fa55635e2b3c1f740575730120a8b8ca9f0d308219f7d78843d7353 +size 700400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a2160b8e12..7350854e29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4392146913d499894e71d27aa000ad9fcdd65034510b7a1ee833b2928e7b75dc +oid sha256:69f2cac73b258fc2e316594451f1092609bdc887203db6cc5eff0c5b9893fe82 size 990544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ba0b18f494..fb48857b1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f7366280e3bd5671eca399a2164222c85dce03111577457b29a307813c15e9b +oid sha256:2974f6b074aa83c8e3ccb75351fbb8491f2e2a55f9a590d3d182d9464165a6be size 942344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index b227bb7432..315d5af8f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a2b8e6a1040dc58eef2283ff6eb8eb46dc13621b943202ad3f5658cd2fbffc2 +oid sha256:b49f2813422f73dc811735f836b58811fec370b064e30b012b86426f6d885967 size 847280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e87948815f..4b03bd59a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73c720bf83429270d102cf04403627fbc6d5d3c77ec5df2ae57cb9352bc16dee +oid sha256:0e6c17addaa79c3a6ea1d42fb9db686bffb43b3ba6f630abc592016715443afe size 935092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index f17f64fde5..97e6307213 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0cfcd0edc4eec3f831dad3bb38fb773cd992cea4b38974039cba77773ea8bf0 +oid sha256:5dd0d0433f90cc25c27f2762de7df57cafd0312411daecd3d50db5c79d47e719 size 890196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..53bab3faae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b395eb57ee389ebd8f570f4a53230d5c208bd886a1171747b679b43d69f23c31 +size 775968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cc3e01d337 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57e786aeee4d381d5869806f4b93165c8166512fb5c6adb9b0c0d68e817ff28f +size 673298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4c485c4c0b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f62b9cd9e7dedc92add2799ebfc8a5e79d3f1681105191cd49dbb0855c56daf +size 755246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7fa891f5da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bec438c01ec97e9738870bd4b2a68d32294933940b2e677bd92dd70061788cb +size 649912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bd66ad2d9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c15323b034a255abe5efa5cd23c6e3d12fef0ae1fb2020d9b8bbf7d8598b7327 +size 878926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..98e78d6ee8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a0b5cf8b7394f49e0750f70f102107dc73cbe48f8b9870051d1db0fd5234c8 +size 787306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c0210a08cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cebd791f0d263ce69c3dc4586744fa961558844b1ae8fcc111153484c87892f +size 732256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0acdcc2a71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22f5dd1b3d223298a26b25c79d81685b912a6124cf929514f75cd28758a0481 +size 637232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fc9f4a0b34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c8883d192375444a84f11b1857da61608e3758099095466ec81771b04185b34 +size 713410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..39eabc5d25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf07b501c98498b9451829c42ff23a29d4ff306f34e75ff4b089741c09b1a32a +size 617249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2d76743336..e2670c9736 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:609681a7455e6d8aae0d8cf3417963c0874ceaa35b5a790d6e31b60f6ea3a987 +oid sha256:15c461449b97ab6ee4bc90aa325f44a09c7a57e62b630719469e4cfbe4fcf994 size 854088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1a0910ad48..83890adeb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8624eb429dde0b59d188417088e9d9bb740dc6ab0b798c53ac3158142ebccf6d +oid sha256:506ce0185d96235b6a9908a219b12869982a7cc0600dfba07d700041d4e3dd9a size 819108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 77c7603104..40a52e9c88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f92f4e7a87ee44a008fdfebbe114f446b1767d50e341582016d2cd5e21a750eb +oid sha256:7a047a6284d9d4c36996003d05bf1b7df142a48742b9bc780e0d2481085fb264 size 709460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a87563c974..e88321b4cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:149f17d3c53f96d6ce47fcfea7fd859e1f4b648103c1154cfcd0bc6d6437bb47 +oid sha256:49626291cf42222be79b7cceeeea882f2c6cefb8a55fd388d8ba5fdf011f59ba size 733608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 860f6b4297..40d134ccf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4828476e474eb039a294c87dba4179e4f2ab3cdac415206d3694dcdd6f04b846 +oid sha256:1583a379b1e84abb2a974fef3a5299085927b5d4dbd2aa5b43eae79aaef9c6ed size 711432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cb4d72b546..b90fb8c1bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91a4d76e8169f2c1a3ff09debe135b30b0141c2ce4f1139733baa4ccd4ee2688 +oid sha256:7e47bd3d2263569bd122e4bfb87b1bbf0fa57ff00d5f6ee9ad7f5192329f20f1 size 730152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 49f224d9b2..fcda5e778d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7ef819659f94bdde5b264b2fa26f558a64652fc13e7042988f211fcdf740f51 +oid sha256:639a29d1d147196ea26f061a67e208becbc7e1f9037c96f1367de4560bf358b6 size 777868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f6d213d87e..f962f936e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93920182b464363cdc280cf436b81e525ee24bd626a1c7c703df7606bd30d72b +oid sha256:d2ddc8597fd0273070e17aec1b49285d46aaa9dba1aa0ce969b2d097862bd56a size 801818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8d016a86a2..26a0f721cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98c04b5d7b5d0b0ca4f71130701011b475943e6d70d9097358342a7f612b8192 +oid sha256:e0684539199be9000b25f3f9d5f6700abe267c63bdd70f58846e27de8f26bfbb size 792392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1abc13c4b8..5286df9278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:accf3760cc3c9e0f9c0e6c9b3274d9027144c52e30b0ce9ffc3331d955ff2d2b +oid sha256:b94a44337559e881d532e639619301faa7de915d8139f91c36610c5502d44878 size 817180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4a9f58fbcb..6f1fa5f925 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa124c91b210e721c55f99e48bf0dd380d6105ded4693842e7a267ca823b85f7 +oid sha256:39c312948f313f89d10a510c03efc443f2771ce81b641328309d9eb05d062eb0 size 623402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3d6cf85db4..9197e3bd83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50b2b512f8b09e8905cbf2b969d4468debf112996c04b3c55ad4a31a083c0b2e +oid sha256:7eee52b72ed3757849f2064fc8fab93e162f4cef7f7e4744a5f3247b293333e8 size 588991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6a54bb382c..c14f1834c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4d3a04a5328f3254511bd986c94819bfdb1e309ef3533fa5c1cc7bc2483c9ac +oid sha256:bb674265dda3c180adf19f20c1e2a2190f683e6b6bedefffebd89de5f4575112 size 549967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index bffca7aff9..ca2d58547d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c144078bbc222b41321bde6cda26f7235fdf11f5528e0b653a9bd0ec3749593e +oid sha256:c191bad88a1467798e7d65f7c41e55f6a93b92fee462be0cbeee85a159777360 size 576307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d2208cab1a..63831615c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aad547b193f188e5dce6c1268d4da5801fe796a6cb9afeb9e7d27c173fff6d0b +oid sha256:00cfbd957b69a5347972c07c932fb57bf4efb8b8fe7114cef34390272bc6fadf size 542067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 555da6a3b4..0f2e0c42e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:382cdda47aa0867468686f199a489c88cabc88270e3278b559fe8edd091ac6d6 +oid sha256:4fa5a4491a7e97215f855d8f96c74d9ad822afd2f8055b6be7f0835a48991270 size 790268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b3d1ac8846..17482764e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:264b9d9c371c5b5a4c994290536d7454245ad3603eb5e222cc3221e10c66ec3b +oid sha256:028a6f6062884c8358d0e4fb97d0a6096621de501af0439a415387de15ae1c5c size 673112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ed34d309a2..acb86cc039 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5c879d149245ed63b3e8edf97151246bf9be37791dc29cfbdb1adc81c4b9c91 +oid sha256:4cd9bc78ed80b20ee342929e44019560a36d145d497c652ee87ea71c925248ad size 617707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3155c49291..8d0b16b30a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2842a3035ae21d391d9d218e6274a9f6de8bda0abee935b724954be2dacf088 +oid sha256:a9f7c3b64601873adae87dd3b37db7bfb57ebec7e1c4ef20cd9fb0b197be1a73 size 812294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5415a181db..f3d700a11d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:003724e62a7e4e4289ff99ef8d4251ce106121d183b2f55b7319dd90cec9d488 +oid sha256:1e6eef6792527ac1021d8559f5c890d98a3185bcc99a46be42c3b15ff4ab6b8d size 607915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index cce2a8c3e0..28370bf61c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:662abeb3018fd5c21e19a459cf48af9e49e3a37b84f7e2eb59fd2c8174f7b4f8 +oid sha256:f6e508b3042884fd1e4a3eec1fc549fd6cae2f71ce9f65b458f46fe9b188072d size 501495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7b2788d8c8..47ecd0faea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f12aff025fef7549ed436a5e09636bbef67b03cfb2cf413aba906e8b8f4c9ccc +oid sha256:e10cbb7e390e3ea552e3d955f190bfe0cba8069a54a27607cd7423bf49ba2b75 size 568595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4084c16ff2..12aaa3ac92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c8191a24d6386e6cf20f3089e8b271ec95afcb7b631bda6e14ef91ab2783562 +oid sha256:1337057dc8702a9c1ca0bbf05d78965c07ed4031ad956824059af1c152e1dea5 size 466467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0119488553..652bcb54e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcff54c4820c7029141895868f9adc878d862fdcefe34d4acb68c256d8208e82 +oid sha256:40285e4b91bf5e5422080e423f38457a1e4b70a6ef4d82e1293d8a104484dbf9 size 614433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c82f5f0ba..4c531dbe86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e49368661fe5aa1370d9857451a035aaee988e0b956fdd3799637f89a255908f +oid sha256:ad5c5f422a30f271c20b96753a1d248e4bdc96d96abafeae75cc3ef3ba668c45 size 575457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8707c9dbf8..b70d2f8bcb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c1fa029a591d321923881b41918700a6d84999c27d27652d631dbd2b7505653 +oid sha256:63929617775b31291f638ac86b3495ade512c12b9aafe78d11b7f4bb64629a0c size 601747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9e8d47e4e2..6e42cc3757 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7bf1dc86240dd9b0568430b3c35d73809fa45b6dfabfcac28b47cc8939ce80d +oid sha256:dd4c79466a652b56146a6db712aaa4a293c646225afec518207c718a37213545 size 565979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 68cc94fc2a..ded5842a3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2af2a5136608880a186b9a8bddf4399ac88e654a5606671a7db62f051cbc7955 +oid sha256:7c7f4b3a824bbef24a14da52a3608d1b49708e81f5c25f868d6e2433b6a267f9 size 863512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 249c3b2010..b605b8e8de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91b4d3d1c18860c6cd124fdd53f1398ee1096301297437c8e8604987f8fb28d5 +oid sha256:ace7e82dba210c7763f1e4f76c87dce20ca24fd4d86b288604f35284388ebe3d size 694062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index db7b08a932..033aa40056 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3105d68576060af095166c08f5bb3d39579d2ff08ab0d870fb8b24768fbfb30 +oid sha256:e174cad60bf3109d8983af4afec16287f9839e12770a5ecc8b6632fcfe2af88f size 647144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3d9d4867bc..a46a7300e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:904e4de441c4ec403149b4ba143f0eb67932c260878fced9d3f3f404480d403d +oid sha256:a6d4e43c640064fec17e359cbfbbb73e9b2cce2dc364b3a8aae547f0f91768bc size 889388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b4b6557553..979aad2bde 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1440368b25584255fc6f34ba1beedf803d3cd4fe91157e87125d1868d835b50 +oid sha256:486b9108007433c91fe61223898d3d37bd88f2b3728ff89d9fbfe5128ebd555e size 631778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index de0586e914..cf08d57452 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a8b485f51f484e18aeee292182a433d832ef28bd4e527267d0d0e11a6752eb7 +oid sha256:a0ed4df954dc66547ef74eeafbe2fe579c8a341e373be8d0561746550b77fded size 525209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 12c2e9fb0d..24efed96d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47fc8f08386b261023398d611310a5d4160f7b4d0a6c0b7d3f5dbb3915626836 +oid sha256:6bc2019b42c308c22d0bc16fa24c14a5083dce21925ebd052110533f60c89a27 size 591273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2800b28991..6fbeb27f67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2a8a61a002e5b5aa9102bd50c2cba2344c2bf7a6ab4b2d273a7c9d9157371ed +oid sha256:e13ade8bf9d5d5dd820f6216593d271b459c1743791f6065bfcf560e12f403ba size 489391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b550e928f2..0b21524f1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34ce0d92a6d18fb782673098e1c3589690b907e73d185a5164afcf5104dd5769 +oid sha256:a90e14c3f899306e2bcada888793a635b1376799e9010df6910e072f925deb99 size 673990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..32dff82c69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1d97beed679b197c3d2b4df7522c434e45e3926b86473d3f50e9b07e6ab065d +size 689406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3475e935a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:460b0f408b3a225385e1faa620806ee16df927a9069fd1f26d2f934f4c2b2476 +size 600153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index bf9942cb37..67e1518836 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b16c3a0f97b955f1149c7c02036fe55f3218c44850598d6d4503fc17a2a1ead +oid sha256:80e4306ed169e147bbefbf43947c86b9daa1fc3b8ebb26654ccc9347ac99575c size 671224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..405a616a7b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7538c107f54f13ea4bc019da868c876d4f38f1b04bebd09c9ae765094d940abb +size 688564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5d9a4c4748 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d1d7e27da7a20a569db88f7a687b1bd0bf94f477259a048b0bcc180b1b03f13 +size 602913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 616d0abe56..5a5f69dd6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f3adbaeafcf9b70e0c64ff6bda2f8b19e0a856f67365c5050797fe03bb8bbb7 +oid sha256:7a510096975e01f83bc270394fc8b6d7c9c2843b0abcdda52e074a6de61d1001 size 739486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..597e8b3f50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:629bbe03f52feb11d2d23134cc6282dc4ae1e271a4f6359ee829d05439d78cac +size 757666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..071b03031f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f82e0b30a68049793e3db61bc28b54259e06699185f2b3f0d6d046eb24f9bf2c +size 670288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e49e798e45..1e21c72909 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:940b0d427c34ba33c9fecae0be7a8a3ea2dbe7efeccbe4220be7eddd8e748dbb +oid sha256:be55e13d6498a4ff3261ca978269ee9477ae0966dba6807289b80d03fd3107ff size 761508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..defbccbbf3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f91d37f8eef4577aa44fff22c317741f013cbf2b9b63c8e3ce1305ef0236e73 +size 778060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d1cbe92f3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ef4986ad977ebe33eddcccccaa1e7a9e0d8566315703ab0d4be2ed51ecf2aac +size 688856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4b667b2afe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c26375906206cb17ef6fd1c5dc3ac23b8ab69fdbe48fff2918e6474b73f49ada +size 647206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cea4482c5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f218d2a4b18d6ef8f6f161f43a65bcd9ade689374dbc5e39c8f4609a66ee74c +size 611683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d502065819 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddeabe7c618928238ffe99de34c82b7c3ed801de34b2feb586ca75ebfa55a1f8 +size 630278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..432e3f8b49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41917ad263315e35d74fb830551cd943bd11c8688523ced379a72d87de7e377 +size 600379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1fd933fb63..db5c5e8f64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e681b91dc1fb22a538ba5f170b5bb50e0a914d0513cef1f48fb7496da45a4c68 +oid sha256:f0d14460c7ed790b4904662ce43cd7533c63b8647897f9432fc9380d79982588 size 760816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 2aceb23fff..f835ca34e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3465baa5821c965f95815da01b34f924353cce8a8d69988be2385704e2c8fd7d +oid sha256:ccdef537ccd14075bc3d1a54b532ac227d727d1de451de61eb96bb3a4a4349cf size 784506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 22c2ae0994..a6af7a3ffa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44a8948950b57472227c087e7020b767a8ec086cf3b1751ba8d85d8cd29dae81 +oid sha256:978342e6a7f6eeec71a1227f0cd4af8ea525a87348c9965809bc0690316b000d size 719138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a4b6c3f031 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ae203638b36c0e61fe23f129ac494246d4cc3516194321086baabba6a19ae1 +size 782004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e9daa39eca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4631ebbee9118d496b9e21040f7994b067f18fa1cf80e5c803808954769d3483 +size 693886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..94d436928b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebddf158eff7258bcea3e3b9b2f6db3bd262f467dc0a2212bff59f443489018 +size 674714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a917927093 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eb57068ce4e271a411eb406475cede0c76158db33f554fe62c351d2a086fc39 +size 568885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4913c914e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfaa39c1f1731c20c68e27b646e64b306fef882151af53970c6a85cd56ae9fc6 +size 627302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e11b7a6dfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:437dfe7541d6c622e2036ae8c39235be329b1a3836c9bf061f832b434cbbb056 +size 525025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1dae766af9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7987322f77764ab070fda4f858c5e09575dbd95fe0e1654dc301f2411231e064 +size 673436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a0b4cfd00 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca2075c8734df35651f0d5139b3671fc6863c5e1026b33e43317b7af56983a6e +size 637124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2aa54e32cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f0eac1f3e99dc1ccaa46eb61ef0eeb4043ed4e7f01b6a4841c1b6dc973788ec +size 655720 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..849c6d626d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3422967398f2b5cb3cd27b8477e78d6b32d457493730ba07702419428623bfe3 +size 626018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ff171f6bc3..0573387c31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9d62b1203c105a80bf0ce32ce58f007fb92423c48547868be58baf1667aa087 +oid sha256:1ac5806463a55ba2cceb9d627d8e291dc0f95abc4c9c3e92aa09a6ce5ed5e56b size 828782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 00cafc5838..d29871c039 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cded8b4ef60b6a16dfa5fbeda3151643cfbcf43a9e216875aee8c8259be38670 +oid sha256:4ad38cde859162d812b7bae84f8a3e878628f4fd0c11b3cc36e16a29c6f07019 size 806246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ad3acded42..67a6b2cfa6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a18924481e898ebc8a62264e0eaba89c74d4501f4bc572d570cb2ae774c825d1 +oid sha256:f0b1e7219c00be212519cc37a59ef7b729253a7a2ab63f6811cf55aa9a2a7059 size 748574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4a923f8dc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fb7faffa1fc8549ce1194cad3a2cd72ce064a7dec4f41e48bfaa8ab15089fcc +size 847010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b0a25df36a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3d61377fa169fffcdb0cd6bb95728ec2187db0aa52abdcb1e89f40646a6c37 +size 760816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f1e303754c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad322c248b12b873d6e8b71ee99b9761231f784c25723c4a0657b432b285bd4 +size 699020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..44db996346 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:739fd051fb001097475694e4f75a0c7b54a4bb03527eecd3feaabc2e56b527da +size 590429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..67eea7c030 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b41823b2cdd104ea730d7aeedb690996deda3c341b5abd59851f6e0c5d67b0e +size 649586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8e57afa01e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76dd36f7b8f6bb0feb3162f955907edb3484e370d4d32a9af7b143b39783eeb0 +size 544891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d8b4fb50d9..c4075d6257 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:509f2e7be1ff3e5e51bce28e5e8d32e22612436afcb66d7d24013cbd880795c6 +oid sha256:152ac2a97e33c5aa3aabeb44071f51d12bb6aa1405d94f46c97806ed6f19e31e size 708866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2f98eb5d15..17a9e5a1c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c985a4c6650f8aaf4c7496cfa618458a4386517668b3b63c7270e98a9331d8a +oid sha256:ad80efe9a3686c53f3d367fca7325ede6d95633e46ce2dba70d74892c4b02c2a size 735480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9770429c31..aba2a8e108 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1125ea1ac7d7673826ba3b0e09b4abd80a55b8f65ac5ce5fb77d10fa34706951 +oid sha256:cafc4674e41e1c6b80e38e848f38db3905f0e426ceebda9f37a0178dbff92af6 size 715426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index bf7418f5d3..bea3fedd73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bcac24a5a4d6caabec3886a6b7d2acfcc5336bb2245c6b49f595167ffd45dae +oid sha256:f7e42c61022dddd0936dffac8b47e60986746ef0bc769f42c56086e2c379e305 size 741448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1bb8194558..3211d9f73e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fb9e6bd9767a08cfa63b81c790cd838c1a14408275ca5838a6cc72ea86d8ac0 +oid sha256:d6051f24710c444aed0f854cb13fd204bcdeab8cbd68c7b9ce0aca06910c0968 size 778852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 610a912894..911fa8f994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01742db5387d506117d748eadbff4056ee926336f2c7b4606a850514e7c45e01 +oid sha256:c24b7aecb3cda09034df2f8af52b84f9f8655c8dfd3df500a18bcd214f4320f0 size 804480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 23c32d6a78..d6d6d8a9b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:62d3078bf8a580a80552792e39d15ac8228b568d165f6b673344b9c269d0bd6e +oid sha256:99b72493e9044f200a58af07f580b2cc1b93bab189aedc5b6ea4a9e97e3d40c4 size 794018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e2e3453722..9c9da206cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a10273cbb1a690b738ae8f5edd0e3c14d341e8e269aae590175ba84fef399ef0 +oid sha256:5a8c3b2137ce0f36b291d094d8aa6c577b7fa7f6833c1abb87e292b8b8a8770c size 819694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d70956fee..9de4dd3c7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:364745343abea41e83e8a6a477503923c1d254a75e3bc471b3be255c384fcdcd +oid sha256:6e680afb56c165d886d9ae01f0c0d34c8a249ca2ac42d53e9599eb16e600dab4 size 623004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 95e21a08a5..a48f468bfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3a0e0db9243044ec6729c3d4a2111587b7b1b60edf89028b21c415727104407 +oid sha256:867ba148a1bcf1b17a3fbab4e01e30983a7229de6c1a80be397b3d7244e99790 size 576261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d106f404f..43aa477202 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:776d76a32b66d9796ea3dd084db699b517210f3df28446b6ec41fafe5e67b823 +oid sha256:9ffb0d5af0f6ae7d429997e9b2a4e24a78c60da15c38f95965e9b2da95d245b8 size 555589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 500c101969..2d0d2e47dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e201d9e41fc79a1aa27450154b783bd69dbe575828edec0663f47dca5837b26 +oid sha256:9c5c60f6d8fd9a62b04dd7c09008281c9778eb7138efa2dc258a6c2791dafff1 size 566537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d68377b306..75786e27e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e7e955413fcda52be21f4f03ed0b9e2113d5b8f0908e42f83b745a2ae6c0911 +oid sha256:aaeeed4aeb9898d07e0d5631b5f0487701f26995039a21f8f5e67dc74e93276e size 546949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index af1ac46c23..28eb02d1fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff3fc85e9d7acbddbc6a85c19dbdf2d8dde5b884a4e7a33d622dce08d822b39a +oid sha256:19eececaf4463a9396d8d8384cafb292f4297d3f7a337e8e86ee23c821fa81c0 size 799146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3202800748..7561db5b76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54043918777e0f861044c8b25b4139e960ba3f1138abe3886aad52ad4e062597 +oid sha256:024c8afaaba5a2417186925b2c18874f2ca8d0b5b70ab5ff55bbcdd817a3704c size 652586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8cb97a676c..f9784850c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e05c7bdd158a7db461bbcfaf8cdb63612d41a55ac676a2f801eea08db4665aa +oid sha256:ff47121a1d20f8a057b58640c8bd66bc4e8d4dcdc00f349b28776d1d10e72df1 size 616669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f8796ca90b..64629c68c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb7588560a14cf77b9df3c2d509dd86da36ffdead06a85b93718a4c4640d6760 +oid sha256:b7be4072957217723c6175345d9f5beb158a2620c4183f757d8a89923bc5f57e size 825168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ddf9d66fd1..2bcd6f3f63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:149d7a2c78a043f3324bf135ba2b4ca909f234a15ce1c83e5e642eb782c98935 +oid sha256:2ab12a0801ddbe5b1fd5acb780c1f9d04e8d47fdbb3cd59c8ff0c5d384e53a3d size 598343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9982bebc85..6b5987d67e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:682f3ade8fd5570de881a2781a989697834406a2e0af24e9ecbe985f9fe7c8b7 +oid sha256:682bc49c0d01c99c619d284abf7b4deec5e93a0c04814e16871854fe39ee9c86 size 494685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4afd8fed05..0c0aefc5f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd038409a47870238a17220ea8884383a2b789642b9967270fa8bf5e7e4ccb0 +oid sha256:28fc440a05f4ce0f684bea04d8def6832c634a94006d976429074b6c3e81ba02 size 571751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 35f07bd3c9..8b9a4b0300 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b98907f968cf9481f5e9875a9579160eefef350d68e4ac54fce823ff5b136197 +oid sha256:e5862ab8ff02279d384b03bdf9aad10773c8de1351ca3ddbc700baf4995f43e9 size 471991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index bacbe37af3..c08f381af3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af83a86a1278f1eeae90751ef30c046c3f5769e6f455067539aaf18c3ff020fb +oid sha256:f58db6e1a226e4d2019c435844b5ec89452929a17d24ca3c06d2147aa0c0e613 size 601751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d777e3014e..ba290c2604 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bae74e0d8cf8c4fc4b891c693e9529a49d78d3c676cc120b0f06f8ea7930d6f +oid sha256:8a8fe6d698ef5c84e8c7d87a5b08c30b148b2983d96e384a3b72e1bb1da7460b size 580141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index bdfaff9f71..17555916c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0ca102c05790248e2c87ae72e21640eac3c2886e0aca6ef469efb5f60adda21 +oid sha256:d8cfda197c44f0b907e99cbea1f45150bf60dcc6b8441e9f5680a5f91eb37bad size 592027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d47c92ddca..2b5641c2b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d17e5eb815ad3f0ade71093619a8f2cc1a084cd652a5acdf7709cde383df1673 +oid sha256:662a4b86ae9b50147338ed5ed363a1bfdb5ad0282bd96d300029f15eb66cdab6 size 571601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2cfb673890..45e303a533 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2972ce2e1ef344c7f8ca185618bfefe032d085b43de23293e9af798e4b375e69 +oid sha256:eca09e562805fce80c8c5364632793346a2776060779556ff329661ee2b945d3 size 865928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1364d39d2d..0af92b8b63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dc9f678a8d3b3ca07ae2f0ba0e4d331bad1e8a2ab372c01a3a76944c271ba87 +oid sha256:a4f14fc1ee33f3749d041c2f249ab3eac86eebf0e4eff09b508a36adf9799034 size 672748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4e24fec1f7..05b59c709b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d8fe0aff5b4ece4170d6740347964e7622d0979a3db0c4c139866aaecb12b91 +oid sha256:490c54a839225b41d22411d61b879a4467c3fd906b57f5b02d2add0101b4c155 size 645170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e7a323500f..946454cd7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c5f57eff5faf709f48e5cb88eb3b39924b1654d6b1c176c741348af78d161bb +oid sha256:dd55112d97de2934f804414503288db75598fc84a67935bc8f4f0a70f4ad8404 size 891902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 622c9a27d1..19d9493bf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa7ea15445063dc2eeaa1485469194f18fbcd545058f8a4edae6cdd2af17e6f8 +oid sha256:7d5e20c1c2a7bd46983bec8859b0d14e64ec1f681395f59892fa8c978d910368 size 622996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index da4795e10f..149dd2686f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:498d3b675e001271bbeb0e647e3ceebee22c21e3227ead3ca4ae05397666d714 +oid sha256:f0f9745f271af9aa344a3d9430b5269e0d83fe1d91074ecf5e106515719a0b43 size 518399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e1f4190095..42dd782760 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7028aed6656323a27b8d7868278555d94fcd796845b1b5214f8ee11dc6b9e40b +oid sha256:2b24d77c3af0999968d084d9a28d4df707e9ed34983f10aae3d906c70acabf6e size 594527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3fd4eae3fe..6da735ff1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32fd3832498d5c16533260b3031d2f5be247904aeb6aacef010c13dc92d99faa +oid sha256:8c273ef202efd0077c78d11fc63f3d825a1e36d61e0859266b5f5aed8ad99668 size 493337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e2d92915d4..48bef03abd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68be37e27dc3160544c92f85312bf2b3a1488a17415231aadc0d6f4bc820b39b +oid sha256:682896b82ba7185b22d1a6e80fb231829c0001297e6b9a00660d20f402445ae0 size 722286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b62e1d7792..e32cbba26c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:336b7248c5296a0e48115f14f1834b9f5f62d76ade0fa78a58edcc78ecb9c90d +oid sha256:fbd84bd55165e234a659a935a73a1460989cdd692e8a623a6c4dd143cfd92ae0 size 746434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f1cf08a2f8..0f650a4c73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc04679983bf6014651c90522c9c14b334e9252e4369bf9c600b9003b60a8635 +oid sha256:4aee3dd1cbe0bb41c536c41be943b7422d453ed5cd58425341b6546ed93427ed size 728944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b1172d6182..0496bf7e0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dc26180a5327727648825923a169f35e79989c52de7be574ff67d59cad3da50 +oid sha256:8469a53a6584974f05dbebb9346c23b83d465cc405c3d6721c812e751dd3cac2 size 745890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9348eaecd2..5f45f35223 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ca6ee60acf8be3f6848a84b17e80a101b470230988c345f147af36c05920169 +oid sha256:d13635a1d818f0859281c2f96f237849bf1d87e912beb64d4448600d062882e3 size 790646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 582fea30b1..cf85786467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:909dbd251e42e9db839cb2197101deeb84d90d1f98ae8a4abaf5dc8da56eb69c +oid sha256:8c2af8c8213d91276b31e86f5530c6a48a6d4325496ed295694f4d381fb2446b size 815386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 18a5f3ca88..6c476edec3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:581f493f0e4dffce9eee09e080c0210edc0f315bff95bfc69df5080eb236f5e4 +oid sha256:eb86de61c579ba776205b893359be245c38f650b7eb3decd17244ed410a7c9c6 size 805958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 8e9990e357..c58b4e85d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88d895b232b548fbabaa868c1f2db6ae5e9c84d555823c3ef0f279b91d0dcda8 +oid sha256:ffda79f64742b4c989ada499966e5c2092731901be9f1fdcf738c71b4dbc55f5 size 779676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 428188413a..413106b161 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3614bca53dd21e5f18395ac6d06f042bcc07ef01c22abef0e308a4fa3ca5c7d8 +oid sha256:f8463b8784ca19607b4593a006663376729bd6ec82fc4babbebed6000f74a191 size 667490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 89ca59026e..353fefc4a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a83d278174ef327b7616d3ce3a3eb9ccbb7eaf821be2db42039f08f085a146e +oid sha256:f5249c5d7e8c2711a110112a8c98d126ec6659b079998cad52350bc908e26da3 size 741242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index f8ded10136..ed35ea4dac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a95a9f400e29961a75f411f90fcf687b991b85d083743359f5de27cc0b13ef8f +oid sha256:c59e23a91a8218aee087df14f521c0f6b100f164e9681bf769ff9990f6cad689 size 674298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 93433584fd..27a4540d2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00c03c2854b10db037bc6268323bc188b3cdf481196a40cb319eba7891d0a4ca +oid sha256:0ab9e1aeb419b82dae68f85a77ac348294fdc3e1e42cc2640b134181101d3200 size 587313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c0da8be69c..3ceb1b0b46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2e9bd1a5760686538c3483362bbf5cf314133490ecee4052daa579fd6ef2b4b +oid sha256:85634a91e02030195c351577c7e08350ca2a45ed397b5a95c4bef45c35179318 size 619634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 07e66cbac0..baa6610529 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a5f834194c63f40138db1662a44936ca86a6c735a0a499ef3b59ed8a298689 +oid sha256:23809c5b752cff00cf8d7b86cb84c2c1c826d1128f20eba69eba7ccf745bc107 size 830748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index f044bab501..6161360d4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98abc341024f974b6fee50c0a5fe3885c6585309e3e3ea6876f18b0c5f4f52a5 +oid sha256:2d6d46f14d6cda6e6ead4ba219e0f30918e07cd1470b2b0d7c53ab81f866335c size 761422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d3fcad20fe..9cad9b611a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c92107f05d875255cd75b5673d0cc317778e456e39622967bcf864d81c2b559a +oid sha256:c5bb23f7461aa74021742d063c00bea16756d96eda295a6bffea039c68e7fb64 size 676462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d7bba3b2f0..490a222511 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c7217ffb2c265a2c07a9ae993def79b72cfac3d886e6dd594e611514a0815f3 +oid sha256:7e38fdb1e82c9c625185b259e943296e7d48f5b9517d4bb47101709e1498cbd7 size 788678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index f3eb1907d7..bb45dc989f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b9d11f04a104c0e8cddc404b82b69fb84a69f1776f830e5f890d3795b703e57 +oid sha256:968f355fb9a7ce2cb6edee8d483357258bae7333eb57fdca89c2bbd3bac844fb size 702040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8bbc88aca5..a6095ee422 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a67bbf6243e8b8be4fb49dade5328c9c7419f5c33c8869d109f59a07f4cf1288 +oid sha256:7bb19fc4b9f0b9dcae9a8ac7d38611c04cbbedf480549ee87c67bf4a2b7e575c size 579717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 622a111f77..dd9137e692 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e132ee4ca6c2346413e53abc11095fa6a3ecf37eef5937de1b429f6f1cdd340e +oid sha256:44322327f1ccfe433506e9d353dcf5940c9cf89dcd9e4fa49233ae97da7061bd size 540841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1ccfcef4f..8d7357f31b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92a21dcf00a0d1e5e669c3ef73313573b1e2c518a629ee5d18063f8350004dcd +oid sha256:7db2093a43882e5ebf37079ed2181dc26d447cfb4cd8ca9f59d5d0955c1bfe5d size 565651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 07e471e3e2..a700a2d4a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37d2a1c8be0fc8d2b0f5576a78abf445ffd486d4bc06fb757fd554b4d4b30fa8 +oid sha256:da9cbe23434088c0353491046d32144065539c1302d07f751043fd2d2841c183 size 532793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 662a058842..847931a8ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67f449e4fd7e966c55bb198e24f5021455beaaa7d778610b651b68f6b11b3fe0 +oid sha256:32daf25f16be4864f0042a6139aae7a2ee8c75b523f095d67629432797b20e5b size 807830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a9dc767914..0bf1c14e51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35fb58be1144a7a4ed377f0ad74d0dac306464d51bb6853aa8ec94641f8b9cf6 +oid sha256:93c1f2553af610a30b18b972caecb669ad10df584b5c464785a1df7ec5b1c7b4 size 828032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 858f2b50f9..d50ae7fc04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce3faf5b86588e80de004fe4963844799ab466c67397f2e21e7b1388307680c0 +oid sha256:20b1b7367b0acfb1a4d77d565f366ae93b91be569f13cbe049af9fb236e69d24 size 582657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 205728c2fc..7341904a44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8f4bde136aa3e8bfcc7a42fbca7e4dc1f5a75a4fbd219b2ea8ed9f10d9f8f4a +oid sha256:a1872f4ebd74e74e2cf9f1cf4e3b238813a4d7168dbda44d85767c54fe266cd8 size 479691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 17dd6fac3a..348f4597bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d6088e88730796eb6ab30e7e7eb3ac5a16bb6486688e28ca722a96f1ccfd950 +oid sha256:e54ae0cb7b47e4115ccf9fa3b23f4db901de74875fba34a0698eb3a33c81384d size 556607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 51bd08bf70..258529909b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a73d50fa42c07e26a7c37271993a323c7874788a3aa1df992810fa95860eda6 +oid sha256:2135653e61c1d7f6663952ed2b3d6c43aba29470e9c573978cf38da122f57acf size 456897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index cb73cd821c..6342490d8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0283bf95295aed9b83ca57888df42d9217c046f077495d595ed26a6d6f3f370e +oid sha256:e2ddb3d7c6456c367eb62d1fac45b6ebc20c1ba8ca429bbbd71071c803da0ae4 size 605157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e01a8ba91d..16199c68ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:885078406096fd7801d64c4b506a25e2660cb2785b133c386ffb817ad368afc9 +oid sha256:5e2b8ddac441d824da9655f9d8e735f47de96f06d1022784b1630355cb5262da size 565491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0a461d8e92..29020292a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c360cfc9a88151bd8bd280dfed2fefdf5878d24150a50a401da8ae49d261dae2 +oid sha256:489ce05f1bdfda166f61af341a980e39b13e488ec3498535b646b2dcc9864b3e size 591091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0401f51e24..ab8759c3c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acd06b0abf00704f8e18f2dc702121728f6eaffc9267b12858e897f7c7965057 +oid sha256:39433219e25fd44a92bc1039d07c1fbdfd7116eff80f32e862339d5746626d90 size 556705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0a7672709a..17132efcf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f66f73c34c03b0e6a1b0cc696dc3a61a01e397333ff0f142bbf6e6282837050 +oid sha256:ec4cc7f9222e0b7152091320ad7382e8495c47178c9dbfe71b6ae52d1acfdb1f size 877080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 786fccf203..42cab933a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8bd6654e71002989c70e26439232376c1c3d755e3f301f1c376358aa70033d7 +oid sha256:a55510a0352bbeaea504e44060bbd7e7ac2591827c974a6c2520e0196ae998eb size 828794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 722806b981..3f6eb33b18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:902e274d89913fa71ea62eecad2a3edd96c9cd3cb2a7dd433ae547d37adb5a6d +oid sha256:101537423ecbff3a8b1253a400f9a7f9bdbdf7edf6d7f1ee639989766691e671 size 695986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index a17f95a041..55c159ea1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da04256648371cf4c0fc3159096c8ff2db82f5efa3cdc07ee37a21aa4f0e7317 +oid sha256:4b838b43b86aef5ab58b79176643330e8384d5b01790a8461c57d66dd0c9376c size 774526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 58a12b764c..8e0a6ccbcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d234e59991c558ad8e3fd12fa4ed9783cebe432466dd7c8f03fee7a2fa97e1d2 +oid sha256:cfc5310800d1610205acec43f1c5d999f0900bc8d944324c6a319e371769d93d size 711822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4cb288d207..78e9a985a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8123dac74dc9972251df2f64c74e7b236db3100ffdf5de1056d7fc81c01295b3 +oid sha256:fab021505ce061b0a1e1857e94d1df6a6c87df304aba02c0461576982eb7845e size 625926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8fca414d53..d21a0f5fd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2c859a47b33e5b2a3db076832b30f732e84251fefa9a598df092cb15669735e +oid sha256:92da058e900266d60836ccf4047fbd71471baa672198ceac856347af2f6bf230 size 654742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index ea77fcc53b..30b9ce32fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc37148b7f0cd0514a9420260f47eac844bd31a27da7f4dc9d57677ecfe5b4e8 +oid sha256:6048aae5f3b3decf219f2b24a473883d053bf1c51a0273d95588b40ed8b72b2f size 902954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9aac8534ae..25fd7601b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3c60950e09485afacd453863b0a93c612dce2480818299d9678f9a0c401c79a +oid sha256:c5d3dcfab205e70adf139b8494a4431624a8a4b58c6bc0513ea0450dcc47f92a size 607357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5f704ce836..87fa037531 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bab59c6fd2193e42074dc261145a1752093cee204c4bd1bf07531059bfc1d4c +oid sha256:8e43277675b938054766f9923303c0e27e5a8b837ac1a8f6a948e4ba9fb61db5 size 503405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e4f9439584..6de6352af4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:600bb3d377d531c3bc85ffa7d78aab87b202ae90919aa3ad43d2ccc4bc2aa746 +oid sha256:2a3cf9d4112941eedc831f1aff4b8286a0564df56ff9be6c612350feb4abd53a size 580075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a8dc7547b..23f13f5f57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ffa26f02bd5faf0073277833b9e2f3158d4ee3fafee11830c3ad604d5ab91a4 +oid sha256:64957a61ad5e0e5a8ced50c65075d4b94e0b52b4f0b9b8634e31e8680cef9070 size 479031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 71ac21a56e..3f34ddf586 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:558b96ba65ba3ee929261b027ce0491c6945f61e84fc427a786b1096a2e270bb +oid sha256:7cd4465fd9ff6695ca696bc6188c919031d6e769fcec16c2fab57d2764f924cd size 699494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..036bf5b4c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0a9aff03ac54d0e97f485b157d5b27fce32b1e660d4029d9887a904a65c6ed7 +size 715700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c57655db8f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c810fe636a2538b53bda19d2f609c1298de3d7a13258b1b35fd9ace1266cb37 +size 623390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 66ce9b7b32..da6d95e680 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b2f73b7f89fc4b1e0712444fd3f9fb92d5f99aceeba9c78163e3f629c662ec0 +oid sha256:378d63c861b82430399c3f73814ce1be0a56694ebd4b083207edd8b36ec9c947 size 696730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7db281581b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95d820b397b4c91d07cad9b51f7055307d5b6efa140f651ec314b44c926026ea +size 714070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e4951a7af9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b146c32bd4f47070e9de25154c5cee14bfa526f026d2cfd3c9536b3d073d02 +size 626052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 076a1449cb..dfa20fd22d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c3f3bfb0b328453e225504bbecda48dff41ec58c2db072f2a9062fc2f7417f2 +oid sha256:c15f48a41ef13634705bebcb1c512946dd9d823183fd583c2bcb5209847093ca size 764992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..97852215e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e36d93f37bcec407790a278553fdf369759c79d5861844c4997dd75d99fc71 +size 783122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..eb5d43ca7f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ee6135f7046c9aff87c8f360ade96ca2191a10537f6cbddefb1d1a21065d5b +size 692586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f2ab8cb0cb..96c79b1675 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:992b5ad0c0a451c56a1081a6c2fe2cc8f1ef768b833e2ea0ca282989bc70de1a +oid sha256:bc00c662db2b622a6bf5b6e4f287edee6d026383296e759a99ef574fe21aa3e7 size 768100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 98d66df0d7..611e858fdd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02d9943dc9782e20b2cbf6c9f1d368271ad0648502bb469ee7ae5f87a03394dd +oid sha256:3f9d2cfb2ff1356768425f3da9252563732c69a4b91cd3f2e187e0e0093627a8 size 751320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b562f6014..98ded2aabe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66e89d2cc01030f7acac5e760614f592677950097bb8c9edf7663452d504f8cb +oid sha256:842ae771625ed7e8d8121ccd10f76ac2b415a839197ea5eccb72329aaa7b2004 size 787014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6b11bb822d..939170594d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77ee040debfa4d27e878e326eae3b20db377397edd2b3de6e2c776b2b4514d48 +oid sha256:c976301ca9e2997dc9f2b4488ce475c8847d775459b5519e985a4b47cc978f01 size 752246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 75ba65bc2d..7021896e58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c196852be8a245b37b89401f2dadbe43c7d95e801c3fbdbb7957f6dac20f1a21 +oid sha256:bf5cda6ca6c0387233f955c1e108a22681a5cf66fdd28eebe308562d90ac2d07 size 765910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 314c993fdc..296845ffd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3c2959a982a4f5a7ba2bf74f7821552d85d11efe3ab5821f6a502c9c03ddd93 +oid sha256:0c5dd74f13a327d7687c9d250c5058652176ad670611ae810c9877a9c9d4494f size 664522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 3de6447898..26711fde2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60bbba78a87f02af924639e55926ec4220f223b07237fbfaa26d9dbef94d3ce7 +oid sha256:6381915a2a7d54127b713730ed7155437d5a1d6117fef852d00a639490d76b13 size 850220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 44fea1c7f1..a875deaf72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:013e0a2b25713d03f4d76933738278c4572b1cabafb8f70f42564d347240e8b5 +oid sha256:e2b9b81368fa59ca18054999b15854631c72f64f4cfb3f900158ed8a092c991b size 659990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6502fd1732..e808ec4d3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36d0fabc1ac860bbf7532b8a750a23b7f9fd85fbdc58fe2d85f7eb6a658e7eff +oid sha256:418f6a5a2f69b9b53079d7ebdd3a304c5156e2fe69593ae4841e3585bf22214a size 715094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bf0a2525b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb4578e91ae0b55685395709d04eb5956c82aa3ee36f56b585b372f8956c32b9 +size 804304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4c531ba02f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd6b471baa3a552c1d4956cf493c0103034a7f5bc2c6f88768f2430f6c838e8 +size 711944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f5f46ffb9b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dff6b96cfa8978a84c3e33a8d52a702c3f90a3632fe16c049880a870f8f43d8b +size 637932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c046d5b4ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124b5eb8f1f85f040b1c57b16fef6d5d5f54e28b14ecbc252aae73c711f851a5 +size 602409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..94a8406ec2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6da19adc7c6770ddb371ab290109365c9988918edf75dd61c5581364c71d356 +size 620412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e9b589ec53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d8c65f56f32c401fa0b7737358d6a6916e259de9ffe546c7bcef6c673aac94 +size 591303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dbb0c49238..adc226f3dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ce9822e336d361ef608536d1201ebcf2480654f8165c6b4c6282291394a642d +oid sha256:1399bf9f9e5580a027fb6ba200a30303b68b9f6d425540fece8378e7c18c1d2f size 787062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1eae1b8ab3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d294b8fca1be06342a440e23852f2cd27aad5a31a62557f070a962eacdd533e +size 807510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..27e265fa6e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c9cc24d945e0e757097a6619db9b57092bf1f4b8508be2d32528a4e61dc014e +size 716136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4c1eddf48e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bf32623b5cb8ddb283996ac9cd0027276b5c3e63a23edf6b71b4170be856ba1 +size 636826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3b93d2e883 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb08ed74e1c3ac49e200c385c8ec6f9c795882701fe1a2484dc1bb6f68a540ba +size 533711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..aaaa383a42 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e26dfeb627e0132ff8a3d2563848e44b17fa61ed575ad10b15b73468d7f5420 +size 607273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9ed8e6286a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab23057841bc8f2573d7e6dc5d9bd6a1808045a1dadda86a1f7fa1dbe8e3bb7 +size 507365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0b91a845ba..4954200137 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2e9b48b9af3995abdb1438752ca1a49ad77cf32389288cc9a9a4c1eeb6730b1 +oid sha256:a63c4cfd648bb6a6d08b69ef46df7af092943bef6641299a3ab687f05b875060 size 796696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5004b4a050 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa6ad07d1ef3e0dc3157cec2201ec25e07a6236df701aaad255adedd70a88d5 +size 664210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7eabde5b83 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6af81325680ad520c0edb0ce6f4941586a1b632e340baad75bccd7237ddf0ee +size 627850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a536acd634..fc935dc5f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3839dd906024327571a79f26a1309131873163ab287d6807809507add15e2e2c +oid sha256:1bee96e7cb16b388bd3c6d04c170628c88effb02cbef3bf531587a7771f86e48 size 779916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..81cccb23ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d58edf03005ed7b341b4f5bd545efc99c6854aa166c1ef5364a58be02020acbe +size 645902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..77c13c4286 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44808f75a32c2a7d3621b8f20ea86c8cab63b19ebadb65e9aea2a46d5774f53f +size 616201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c429494748..526fb59519 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fe56097983ce4b6cba5e405735b1a69f4ce2d389f70422445b86200bdb09723 +oid sha256:1b2b6ad47e45a61495eb099c3ccd12ac698881582a143ca23c56a3036c04a852 size 854288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7bb30e0918..8998710619 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe7a4234041a76c855b46b4a11833cbfc4747cd7b3d2afbdae2de3497d0c8265 +oid sha256:0dbfe572b4e0c570faeea6a4254740d1c8f90002e14341128388ebd9dd903146 size 800328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0c695806a8..6958c6d7a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cb013a19d65de172e4eef52e35b7125cb55a590cf346e1ebf97b7ef2c8eb41c +oid sha256:a84f295030c096e45e6a5a0ba9b5981b8c215c952e0d9928ca024d8efdaec33f size 793666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d783cae942..11df6c840c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c18f636aabbd4b6198b52fbcf28278745c380d2ef112acd1084bd9a44c38f990 +oid sha256:ad757ea0120940aae4e00731315681dd7fdc8c5cfb24b852db71c2a62c1a6452 size 691046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index e1824b507c..1fae2effaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7e570486db827c6e650afdb3c2451f265db810511700c6d47b9b92df1c2b8c2 +oid sha256:aa8188017cf5d142744c64f853040e1299c5a6b04522c114b6133ae027ef73dc size 883552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 41bd163d0e..2776fad047 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e02b9231584f4950bd7fe5c647ba540c3a0b2f02fbe21a70e18e3ac406c5da24 +oid sha256:26ef4b9933699ca92801cccc79c61ae8b1ef346a9b37af8321250caa13c5176b size 694112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0b9190607e..2a8d8c2664 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd728ab9e28ceb06beb2d62c217f7cfeddbd68bc07be1c025249b15633cf087a +oid sha256:8be29b3795e6cfea36ddb9750deee9b5f62384940d27154b61956db5a2d20cae size 750498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 28684f777a..9de7994edc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5354d03b9bfa07de60349f6d440d95f38318057bf73ccbb84459eb37acb6b65d +oid sha256:d8b7debd57e313aab8f714365ae5ec47707f99908a4c12ae0961b44f7bcdc50e size 649504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9a8f27c146 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d465d6eb0e1db90c2c0c971be0edbfab624ae337b2d5e41d6a1f13ed898ebcf +size 872516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..37783f61ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:377a81e97d911df8aef583cb3837d30c839d5cf20e5c505a2f25edcccfaaa717 +size 783856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3b06074e22 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:233a18522acfd65d03768ee71373097c6d99620595811317c57fe8b43b618515 +size 660342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2b7eac3f34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc22db5822dd697a78fbc8e05f9ff729f6064ac36f45d55d10384087f64f98a +size 555253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bf16424309 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79180e252cd86268d3062ac33eb39ca4452f245acdd709b3654d55ea4bef6ec5 +size 630346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f5baf8b205 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f84582fe48dea1694ef5c7697a46091eceeb5949e2eddeb9b14b5ef24f75928 +size 528119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a749f776cf..114ede9bf5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aaf3387ceb47bbc152e169910ecd51c0fbfffe39e1962b8165f3404abfd30f14 +oid sha256:34cc82d95258fda837b8f3541142698b6cf33b013d036eed6e8b7b692063f166 size 648828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cc9c1e2f2e..1beda74b2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:746d2e57f63fbf28d5cca0588c76985e74735fb4357200d15d91968d8b79d877 +oid sha256:964286650e3ea3c254ae8b9ab690930fdf785e48b0c6e43b42cd367edf82e817 size 675442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 620e115415..6f62474cb1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5be681c4015d22bc027e04ef890c0d76960c4645fc3f6165ed3d03fdc1a463d7 +oid sha256:1b338566715a8014c78a2b6515e7bd62ec4670fb63ec17455a5f16f8c7f1e08b size 652032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f5af4945e0..b993116e57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77d5ad29a8bf95223c9d0c451c4c354dd94331e3641f0adec9f5b78ff8bdb519 +oid sha256:c3c56b50c4e6baa370fea65a2830ebccbcfd8ca9d86df56d278d26554ef88e7e size 673220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ad67634495..126b2805fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75bf017884352c9021a7d1c0cb167e86960f06227a9765d8be2314cc0a15732d +oid sha256:9ac41d7acbe00141e6f4d586c5afd3427e92fd70b8db6dfac33759984e66335f size 717976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5977c662c9..045d22cef7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6d47942fd2fc9998dd0371c955f3c40dc027b213efa1c2cae8676c1777c3035 +oid sha256:30d553f46a5f3e13e1dd6498ca546ac8215a792cfde436a2a2aa28bc338efd85 size 743604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a79b7b5528..5b54854e6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad8faf05bbf8099897d5658414d03a118a19f3088218c86b88bd459e94e886b2 +oid sha256:a1851f0c799f099a9321d34cecbc9487ac6fee15afeefb2415b1dc47a20b9b69 size 733930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 7f3915d6a6..0707243b2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b475df2fa70d9cb2422ea91aa681150c30d70d6d3186dc49072cf3bed6ec0393 +oid sha256:72c845f38e80b16a54901b76ec99523f607e7ef4ce3878133317bb3ba7c0c2b9 size 724962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9678acbe08..7e217edfcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b360583c22fa2b0471a76549ee7ab5bb111765e2fc042e644969e6696362ed64 +oid sha256:628449577d68bb6d8153f5cbb0217be2ccd5fb9e0249d486cd4b0a65e5adc5e5 size 665268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 0a6305cd7d..58a0801805 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:284d6361bf8f522ff032a223191a2d7f7ad223d080b052ee5d8a89a23ba2c70c +oid sha256:25c840a2d2215c2a12938ad2df5c91733b79f1c9fc7832692b72fe3f203a795d size 729302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index ad1d0a8ab5..6a788bef5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69ba97f8587810465b626cca62d800ce456813be220e5dfad87d824ba558f16b +oid sha256:1de97d942b655425e02a8fef57a1d2fcc46318f7c9c5d1331e8f8c8758d759e0 size 659692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a098282743..620ba32bcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b58628ad3b720c506df70eaddd5e6a3269cf039120fbfb652f892b186496ad8 +oid sha256:bd8abe0c795bfd75569750c3e1bfe74d1e5d5b9be85ee320370e9dfc8bad5a79 size 570341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 22b59db7bf..1efdf0c693 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a194fca01eec9d76e8809d3faeff3320f037fec242722b94a41cf362ca207a7f +oid sha256:cd7097ddd72d21c923c53d34fdc26511a514ef2740f4d530b6a066069bbcea36 size 620076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index fc33d11cb9..85ccdc8e49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:899a388491f60a2659cbf148f58049c6675e41e846591d698f632c79cf30d693 +oid sha256:5e68638cfb1c2919d4e952db822f532fc3ddb719ecb7c3101a058592a8b5dfbd size 759606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index cfe5aa7d23..1bb38bfbd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bb9fcf951a0957dfeab5cab4770e76522dad846f22f1e920eab5c856fe5c4ce +oid sha256:14542bcaa81c78187c1ec084df077108f53f24a4474ff33cf6594804f591893c size 707598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 50459ed86b..a6666b553b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bf0ee5150635a4384822d796cfadde468b87a622af5a94d94650f701c79b743 +oid sha256:be5683d30c4c4741178a0015122bd76d9a645745e396f1b4dcb67c732d837677 size 616321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 2a7f5f692b..715e696b5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5959262eeabe6265e0fa748c6f0a6f8e2aec3c77f7bf0a13eccdbd360a821592 +oid sha256:36ffa2f3a1248bac02b8b9b5eb04d53f767183864d1808ac2af09633c8e79fe7 size 735642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1a3e42b796..dc89191172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5096e39f087fbcbb30ff331ffbd686ade75cbfeb7b98ffa4cd87ab77592edd54 +oid sha256:148d6295df91759f63627f48b4242bb62c36103fa72dfe0594b4efad865c360b size 643480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9d24e1cf0b..a7539f3a7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f13410662a96a7906898780958c4f85b6496cb405f451afde5257eff0e92500 +oid sha256:d484dc963de1be08ae7183c9a016d8475814e6aedb9ff3367729f56875f59eb6 size 567185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8f9a7db368..36f8868d83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d59c2f9eccbbc643dd69d1ab03b7b5261f8a05d576e9477b2960707940618c4 +oid sha256:27a4ad22c6b862dd43b3e7551e98198a1c8f3c56d5d135477bdf4bcd2037c701 size 545623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b8d938b87b..8393b24353 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29d3cc7811383406c0dfa635e5f8ed0479162d9c321b8e737469ece0c7af8c73 +oid sha256:9163b31404711092be4ae104360b45342de984438dda56f168c615180dfc6c5e size 557263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 43e8adfafb..a40a44c476 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c42638a88be5aaf27d7b43a36f49190f45480d5b29e92e7ea635fdab792c92e8 +oid sha256:6c78ad0f9fd48ec8e82ba912b6d72aef13358c1050da327324814657730fcb34 size 536885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a9e4a5818f..ad250a479d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bccc52e1b520b821ba6cf9498e08cc0086e9eee1c20425e4963a1007d58e844 +oid sha256:208cb927cfbb31eb4b0caf1160f7df37b20c63ca74ffa6aba8593d6b9ca5533c size 730868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c90479ffa7..b75aec5cfb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b97fafe0f81315d29c9d1deed0423f462a0e7e457bf152f9eb979eeede53a7a +oid sha256:5158574b082fe2a19a0d60f4212e0b8fcd2269552ad249af5c69ba100e1c71af size 757730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0fc7ffc8e3..9cc03c9892 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81798063f731761e313b4c7dda82fbc601710fdee482d31e1500e143a84c6583 +oid sha256:0141910df455247eb36bab31c5519010cf36d43e7481a2a6a52b4b909e82e309 size 587637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 262404a72c..a3521405b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a7ad5c65a4fcde860bffbcbefc84fe7c8934dea2639fdaeeae5e9c5289209cb +oid sha256:f6974ff2a3d8a4e0697346426b48e70c405cefbe355e9fe165913f9116be6b54 size 484573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9edc57488d..85bf194f53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c49aa396d2b4a88689459f821bf75c52ba60b16739826b2e3040f166c2437f4 +oid sha256:b1e3c2e920e7cadfa49ce232c1b708412bd85015928d0d92d343006cda2e269c size 562131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0c1c8709ac..165f9981a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f468526a6e37b8bdf281513afc82dbd2c676191da23aaa9a80579cf38da92f1 +oid sha256:a5c8b8959f7745e6af1e054632a08579a61d328dfc27b8f638d5b21019ee9f83 size 461679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index fda3620836..2302ce4e09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e21c2d3a258e82528ade639b06a7bf0b5b1664d12ba76cd8de042eeea084ffd +oid sha256:bac647f11f1a7a2e88c46f6ad20823558777c280dabfef4eb33bbb7312b54b8b size 593463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index b32207281e..a584c2ac18 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c187d35aacb6f99e394312956def01356a23dbb4d04dcd85358353c6de9c7a77 +oid sha256:4d5b1c75297e957ff55460ff4431b5caf78c2094200ea58895eeb6f27925260f size 570275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7a28be01df..edd420eeeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a2be9a5de50143de7d755db2cb313d317a276d022ec56d29d07ccea421d99cb +oid sha256:f9ec871f29a2c21959ed5b88789d037915e0ea5900456a117fc0ca0c4bd07922 size 583541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index a708d8d9e9..e88a9b0b41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fe3e6e8d3e1dd0b55d29a4570809983ec78900ebb974228609878e56eca658a +oid sha256:abec1c0e084bcf1d06c9f0df5819cf001ec67feb0039db597670d5bde292dc8d size 562375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6c60a718b1..dbd7091ef6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02a43111ab675dbff504ceeb5e3ff2b311b4d7f20ea1ebb40036d69dc635ab2b +oid sha256:112eaaa2d58481ea06eaaf08c1732935ca0190b085aec022cbc00a16673ceed8 size 805100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 2ad7c2c5cb..d43f32ea39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22b8169701318dbdc8adf07cf91f561afb93752e91d450f0f7f9946ffe7c63bd +oid sha256:18da96093a1d2c8d755dbaf0dad9367f2dff2b34ddf6b43088792f319bddd2e2 size 773786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e9dd296869..14cac7555d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2c6d16f4ad28902e34aa2305ccf778d6740e9e1fc05e9ad6445f7b38f077778 +oid sha256:819fd1c185f9fb75d9822505db11a5c1b91d6fe0727bee76b15687b69b632b02 size 693024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 1fc17645eb..63122d0814 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1518c5981573a72a18d085ee13dcc279fccfa3aa4dcaf3aaa5c540f7a5ba449f +oid sha256:d0762f1e5eefee73e2c4dab2de87147166da374b063841bcc21396e9c79de20a size 762486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5d7b13896e..be59539e39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f83bd2fbc403b84f8995881e033e9ca66e703c855adcc8918c4eb15e8c05ec96 +oid sha256:74a8a4c7f59c75f7cc1036a9c6a990cafdfaf4bb264fe47d227bf8e9275cd1e8 size 696478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cfca9ce4c9..6a4cc39786 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f9a526741704cded156a4dfd1ab4776bc4a8da1417acd6df39e2832362ba6c2 +oid sha256:34bbb811fc5f4fae0b44b91baf36cefd41d051ff223e1c1b44a72405a5f0d58b size 608951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 06258d669a..8e68204a04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:100b64893b0abb8913a7f6d3a8c68200535c6632a1ae5061d826aed90791a4c0 +oid sha256:d1ed4d5a4f11d959ae0ff09d576ff1fa88c537f478df60d18b35060eaa3fde34 size 654986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7b9e6e7191..b1a4a9b6fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:866a5210443620ec9d7dd21a4100aa5db794326f7bcd3fd933f9d7b752901830 +oid sha256:edac3d624c83a175e9d1e0d24a18001e9418fc36eda6d5c394cd5d4a394af168 size 831862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e092bf9656..176bf208a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9235035a3dd226f6fba107626036bf1c5dec8b27b45f9879c1fd18c2529d1e8 +oid sha256:7f1bfef2720ea6fb993e758c6170f48aedeaa1c03b2ed2e25145f503965cf5ac size 612289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index eb505210c5..91fb9ace69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2ac32cda065926e8bb3675731a151a3c127a053deb0dee0796c45f94389d465 +oid sha256:630dab832c05fe5cf9b11c469bef26a02eca2d10d6931d6b721d2d2101b04a7a size 508287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 217c612136..7fee964176 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c15f290643cb5984855c7c01a65cc0b18785f763774d138dc66125bfd9f2d059 +oid sha256:a5ed5acd6d92b1c5e910e5caad62042fe572be2a667653427aaf3ebc4f414b11 size 584809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e94649aa42..ad1ea2b1c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6231237a78736a07313d573b953b95af4ccbc6308876ceeb7aa37de9ca149a5a +oid sha256:f7d1cf7f0533854d61339b104553c8f3b64f1fe4d29676a6bc023688ee68c6c7 size 483765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c579ba2af6..dab08c1ed4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e249af99be5bbaab0ec500d5db4964046d740fec4415cedec0e6ee9e970c8b2b +oid sha256:a244ba7dc85cb201f854626f366c27ad43ad4d06272a81e4fd7292cc3981f2cd size 693476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 49d56cccac..d5db8ffb6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c91e49ca0f1aff445d18ecdd9338472cc511fe1c26ed74fe71a45784aca8d731 +oid sha256:15dc7bfb5dd26f4f952578ea9f3ec88a762a9b120bc7000cc445f78af9a6a71c size 718462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e93838532f..86d6eb783e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e27686853f03e3cf9dfba0d46e638d4d4d37a5ce652493c6a61306176f667ef9 +oid sha256:47c4b5809baa13fccf685d61a5f17988da3038cd912c0b01b32c2d6cda0d35f6 size 699196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 38d0b43a35..346393dea1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f53d6787d526a9713e911b59c0bb101ee5a13fd3f5edcc16c146e758d2bd098c +oid sha256:cd83b1991d2d4bdedb66590b0aec09032897ed010507c7e5ae6b965a036f368e size 717080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7e62a80c20..0b37b53aea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de0c6e140212bdf30ce46744b67bc7c8ab34c5d33452b83cf6f4099d133bf1ee +oid sha256:afe0d0c7fcb1b8977f99f18fcfd45ce77d72c1ab2175c3eb5ce5226c9a889864 size 761884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4f0db57d95..65a73ea72d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35badb904d0344a7c0184c57fb0648b3075af93ac0956040b004df550a1af5d9 +oid sha256:24e14d04e6f7c4ac96d993974d1e038ff9bd18ade3ea380240fd49c220a4a814 size 786624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a2f1918b7a..b69cc0033f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce02842aa8ab9258fa84cd4eb733a87714c5da28f1167611b8706738c71428d0 +oid sha256:69ace207004f4407a1cc9c978a24cdc21d9fb76f91154d3649c6c7942e25c8d7 size 777986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index aeabb0d2e9..ad7059cf26 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39525b34cec35bfb6505dc60a9fe42814b73451bb3b65f6ac0bbf771e122d425 +oid sha256:9a226fb72a09dadefff97199bfedb9b73addbadb3608dbd53b908db4fce3613d size 787026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7e5c4390bb..7028ec3f7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bedf6378f88e1045f5fc25e003cb2fef0f038c849850dfc96e670518786fe97 +oid sha256:132cc4325a41b4e2daf197067f1e28f69169e9c087ee488490993612e71e9276 size 680514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 85be56802d..af49a29780 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edf296deba04285629a9a4fba9ba855b8d8ad4f3dc2d9f5c42a1e3e719897026 +oid sha256:b0c8b253486f17cd0a358afaed1ce92c6798d6bf65629ebd059bf83302b8b912 size 762850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 50a4531ba0..d340885d30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e71ba0fef66d70c23d8ccba85ddf20b636a5b43b11b71ef5d93d35a4092298cb +oid sha256:eccad437a03b5b5c36715f2299fc8f612cb0ffad30075cebc7dcaa5d3daa921d size 681550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 0f92bb6b1e..9ef0135ad8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:497292f95714cf76f7367f3db954aafa9ae2d219e3f238bd0cd49c5a8eb594e0 +oid sha256:50199875b7c5221196cce495481c8cb5e3097c33011a13e9e12ff165d7316716 size 592839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 87c959fc7d..ab31cdba60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78f7d15d2f16113abaec3d4d2d42d3b8a199b7ae1971c930eddd1dc07dc34e61 +oid sha256:1380eb11c5ebcc87ee704ef60316d4dbd65f6cb5112776b72d80d7da3969534c size 620324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 367cd39b3a..e18f5b2c43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ab39e18c8379ed641b1ec5a7c36fd72bdf9b48773db2e6889c68c8c60f214aa +oid sha256:e8e070225071fb1871396df08b8eb30e69d17fe9669db4e1811c0cfea697b582 size 802776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 5df83c5030..976c8c7587 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c324a96e5ca9a89cb6303061c5c937f3a15a4f07b096a184f6d4bc5184662f9 +oid sha256:5d87e7db044ff58563bb8358f600c741401767e8f1d26e6555e78fff5d4d5f04 size 768970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1695291b65..4bfd2681b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4eb4601fec057644199548a5be255844b1b78e8b42e4fb755205f2cbf3601b3 +oid sha256:186015fcaaa860e705402f0502fe138e0ff5aea362b07ce035eae98050eac633 size 685292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index f12a1d3a1b..b19df08981 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:442783a9912edd0761b3b89d9ae0003e674e6432320eb9e4cdf1cf000292b1fb +oid sha256:18062d6b856e3d2c627ba9bf50cd23b335b548bb6b653c10478dcf70fbfae0c9 size 797064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index 41cf096b79..94b8aeb143 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d71824cd290452c55c3734df2a017fd468fd643a5ff019b3b5ff98d89a6c4006 +oid sha256:f72c3ba101efedd5e8122cf18e632f9892e4af77c3c37e7c523bf16bc7d24ae0 size 710872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index e36b07c97f..f9ff600d25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55a582ec090b759a302be8196490749ff241f79b934ab39a1fb4fd9bb2b4250b +oid sha256:d4bb43875831dbfa557a52cd1ec0ac4d6797ab5b260cd25098f2309cc3c21088 size 580407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index acce4141d4..070f3329f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:478b52eea3288f890995b10b60137557086a7f657b8f0f4c7ecf437432e61138 +oid sha256:585c83087bab666cbb291d2a56d77786d347032a596f61e496852d1bbcf03fbd size 540741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2b80274e3c..c8db54e864 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37a495fd582cc1c4f52c855daafdc3ee852d9ab56a122daf2d77fab43f8ad171 +oid sha256:bc8055eabf24450ece5866e02940e285b687ee2db073347791a74e0e297e062b size 566341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index e099225819..281a070afc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f6efbfb2fedf195f666e8238117f5fa20e0ee0dea56c12e0b19ac6eb493591e +oid sha256:86135e5b86c48927ccc95ed63a218b5751e78981add0fdac760b87b13d84738a size 532743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 69ed44fe81..f50361d4f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:784ad5dc359f570df766a8159b8a2c2f8f8adb06e968c122da820d8141d06e5f +oid sha256:cc7bc85866567580bfbe5661c3c949f8eb1f64281a2f8be7c033d0cd1685f423 size 777984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6da18f8af0..24fcbcdba0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a074f97f30029d72b8eb9e606bfeddb6f70aab0743e217bde22edced0cbd82d3 +oid sha256:281fa59cb08abe2f6a396001e356653b08f162fb8418d784d3da26e8b2c1ce1c size 799270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index dddafc57da..cf835a7fca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:202a1fda90840cc0148a1038ca435e97501a9d8e0a73f1659512bfb83f4b5026 +oid sha256:2a94280a419732d0938bbc06b9f6f84c3c3dd5d3a538cd4cf17674bd61ebe24a size 588971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index f79142b207..78e2bcc859 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c939e9f227a60d96f202a97ec2b17eb59eab0c89a0e3568031138df4d4fadfa3 +oid sha256:c5f01d48558b09037bccc36a4fdfffd2e401d021388351bfb0c396b8aeea6b58 size 485363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 029c8680de..3d67b760a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16e2b9fcd12d883d643f6f4111dbd36ecedc738ea59a3af1306c7d7c23ec5838 +oid sha256:b5dfceae3472b5ebfccc515c384684499b444c964a4c7c20ef5765935dd1cff7 size 558087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c3860215a..f8dce5ddec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b639b3bdefd4e27671e40bd9a928b489918aa415d38cb12bd5135d090981cd4 +oid sha256:4aaa32fe7256d95599d6455e571fc0a3b57a1b1aa2e42508307b4b29a27b72a7 size 458327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0bd9fc4075..c7d8111f9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95ed68a415bf9dd958e4999693ba221a84f5d13fd83632f9ac54fa5c0cf96e8d +oid sha256:9fd346fa382372d85270c411eb5a3b1be4cf52c21ca095cc261a73e7eb12b6d0 size 605849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 42591fc855..6dc2a45a61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b854d6fe88694a18244a8696fdb08e54d34a376eb14679f083c6f890236cc3f4 +oid sha256:1b59b7de94ddfdb77465000848ec1b104808d8ca9d90d94ab476d808946b63f9 size 566183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 27a0051ba4..4a4bd11325 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7e82b7d258d0e4053996209e04251976476f37952e5d160afd0aef4a9b2470d +oid sha256:fa7689e8cb1283a2ee5861eb601c651f5daab5f93e6a8c9f37160662be8db67c size 591783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 49985091b3..32f3147130 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81a26733dd7e7348dbcd00a1ccad6926a3f59fc0d1c9e878d52b003521b6db26 +oid sha256:7c2f7fe85b059873608e4d4ba6d6e0c7c32fda0a711d1962b71f8d8fa5e9b822 size 557395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0a575300ec..12b065e8c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51fef32bfa2106285e118ad3f3741b0ab323657e17d2cfa90707be0287cf0cb8 +oid sha256:af1b2b82fb6cbbccd6e410d2f833627b2e76e288573e6d252bb41b7f646d3bab size 848318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index db4c2da416..cc8e1305c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad71463e7ac754deaa521c1e7bcdfdce655561023e161b4c152e494efb27fba0 +oid sha256:5584549ca3e29650c6805464c04200759fb18949ca09e0a210bf4c81c1ba71ae size 836144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 90ad53d344..4af33318d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27d1e72cfaefbb4683c842e2632f67bc3ef2cb9ef0d959f6a2a0e5ccdd9839a5 +oid sha256:62f7933dc2abfbfdbe58439eaccc01627d12f35aa6507f6224b1204c7cd9329d size 708270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 2496c32cd1..583e71e46c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0944793752e35518b3e77d5c577a641ad167c950bcf5194fadb8c9eab340e86 +oid sha256:787dc0c3213d683db50a67f428fd589bfc15bb1e74c058e3c40a62915004a702 size 796182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index d2f7eb60b8..3fc8789e7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3600815c893897ab786027fb6d013ad22c2c343a469ad9ac6e81c7059434cdf2 +oid sha256:75a69678d918028f61de7cd3f1039e141ec0cd81239b7f5c8c43c2bf7a503123 size 719074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 33f0a64fc9..e5fdba08b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba8b6c75dad5bd6abc4907bbb8ede6354f800c4cf440eec173702b3c324f2ce0 +oid sha256:e3c87dd1011fa320751b99434d4611fe9adbc8e9db1db17b51abdf5a860b78e5 size 631450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c99f7c5061..571b407084 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:954c153d97ece82f318a8ab0defee2310e08a40d35322c90d2697147241196d3 +oid sha256:9ac60290a2ff1e0773f9c771652386d5f9d2a8507c563d59903c6137e1e60e7a size 656222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6c22ebc65b..0961924588 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79058c1c6f9cc315e88071c7858d12a0a30f08e09c25707fc2e4a44891c98d98 +oid sha256:9829acf41e37219c2398911e4e9aaa5fcf4425bb1907f686e760ac051640c079 size 873404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c82ec9c9f1..f890ead489 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c393aafde6b184af00c4401918a2bc7dfac82002d3c2f562548b5807ee2fe49 +oid sha256:d29793c08374b7eec32b100a5c3195213f11b1a100122a706c3bc0099b40c4b1 size 613673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 27d854866e..b4a00ee9aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68aef2b3eccd543f6d90b559313ba92df350eddc31167534a9a71937b6618650 +oid sha256:f947730057395e9b88e809fe52c2536b620814d6f0005c7a88ff3494d7062dcb size 509077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9c8be7225b..abef82ba05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05bd2103f4b26c2fc79722e45e54f4223b7f4a8bfd781e32f11f1963d223f263 +oid sha256:588359d8ad5ea1341a713dea30de68bcc04b65f2a1781b5c4d8c16c56565535c size 581555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 259dc34f7d..11aa6990e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:130630ae856f99d71e24e0d5105f5e1181908889f36b2171446705e24ddb7833 +oid sha256:3eb90c84d2fc755cff911570766be7e8a4ff0b5b123198da0afb3b3308b335a4 size 480463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 867d729879..8e385b3f04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f746ec69a33ba9f6599376ebd752c0f871edb44a1ff81e4fce95c2f5e0ab2805 +oid sha256:a2f8fba72638025f4c32f43aea43a395db6ece5e2607feadcffb4abd3437d703 size 668316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..df90a052d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be2f41614dd2773da31dcb4f145e9015a09b5b5a0e640089d97724684143549b +size 683732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2548b60cef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62fff0a2319d6a5e445190087cd2cbb1dc0a108763a69d06800c2f1bd6832777 +size 593789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 661c7f13a9..04f591aea5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:765402fc80467f4ace731b453db38020ff099ff39918f76dfa4de26d4db8d052 +oid sha256:82b799ef0144769939420451a9a5f3e818bb5f4428a82d8d085c156f6ba2d8a3 size 664762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fc7bbc6440 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:685601bdb9d70c5c9bba0fbb84a826fd0c788187a3e937c312e410c8bf02f15b +size 682102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4365adc0b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5ca5e1ece8e74727aa8f5b05fb348b93853e5dc10a97ad447b0caddb34943f +size 596451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5a159ba619..a6cf8ddc9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:867907abc5b941fded8382dafb1c595c122bb6223ed76ad0593074ec621bbec3 +oid sha256:758a26d01facd2438d46b45308bf50ea10382f7af3d9fa58f1bec6453d8bb147 size 733024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..27a74f254d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:514461374543fe058e45f15f289d806982097c57df13ea18735f5bca2b592e9a +size 751154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..14ec7cef22 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87e840d464e3279fac72532adf25be74da4c4fc2abc01474ccb52158ce95c482 +size 663776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 74c3ccde61..340927754f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6257ad2af2582dab59eeffce6cf724ac69a26943ec3360611429e5474b2fd5b +oid sha256:a51ed2b4c4bf62ea7d1e1f71838217685d348976dca2d5bec3d5a83cc4d8200f size 768002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3a8b76f940..4abb185d2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b2454d31c1701ea5a45f5a263b0653f2519fd674829f20ebd3688fedc2f4507 +oid sha256:21f39d1f8df2d6ba54f6fffa1ac80c9b6a478caac3ccf6f0de3c39cb661dc00c size 751222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d4e8b34966..49ca673b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a93431b14480df7d8c6d768068099c29bdfba349a65cb3658a8b8e3d2656445 +oid sha256:a821431574468fafa33c0621a128e711f585dd950155bd3eee77170807f6f550 size 755836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 8de4fb5d9e..70d008cd4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b50dc942d42a05e88bec22b7e595ef5d42886162dfeaaac34e78cd34eaab566 +oid sha256:22f3ada162017b5e15bb210937d9372ea69de9ea8fd64ca3398395ca768c10fe size 721068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 901d2eb734..26e059391a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f50a24951e8b3df8027448bf2d2203822b6f632a54c5d4002cc02099261d6ca +oid sha256:57124335d9a22e7d5b04739c7527b6f21382e140f35158d9552d9e080bbd09f3 size 778194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06df9d2945..b18dd2731d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:897ec48208e01bd65435f7036748af01b026e8e5cbebd70bb4e163b3ba3d7d1f +oid sha256:8f92212af4a75881e0c45ef8b337251b432ac7771c3340ccc5d4e7cb027fd575 size 673550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 52b02c98af..878f819512 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b331d958a93aa5021bb303fa218a13f6dfd22c2799c623e816b9b223340ee941 +oid sha256:ab016c3c76206f454076f7a7288fb058fa7d84316a0bf00922a84d12702684d7 size 872716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 9692e7577f..0b70e0ea89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcc171628e6a4a269717853fd9e7f19b192ae4ebf539a54126c54c1db4350024 +oid sha256:09a469c0127a022c7c1104031756bab51daeca9ae492d193f3c2e3208bfc46e6 size 670400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3d8d375560..10c95cacb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72aecc935abb9966b1fe15ce019567a496a56868df17a17511229e58791cbcf4 +oid sha256:e7ace20aa585623ed5da24bae7526628925011be9e4154a2dd21733ef5abd8f7 size 714996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d0791b081b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c120d8aeaf85d2e19f42b8308d51c533321305b41dbad41aaaafb6e2b1e66b4 +size 772386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0226c1d4af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a1d22728490e41ebecaf7976246840f34c18f008249515f3c32fbfeabeb942 +size 683134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..90b8952ca2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405031492673ebaee5c1f1160eb55173c57590cc57b4a0c2be1071ad3dcce9f4 +size 638622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b4fabe4d51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b836b4e7a3db1c14f05b4418dc793293a41b6a0a172cb7b869576d6188a05284 +size 603099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2bf3db98e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1024eaf1d58d7f6c0b867a1d0bee112061c0849b7a7d84ccae4ea96d12f7478f +size 621102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..008710459f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fcc374245596dc469e1397a478b7968210fa61e75d23d5637da15cace3eb74 +size 591203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0d5da53b6d..a3fc5e2368 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b985bb5dfa35212f567112b3d7bb5f0800250daff4ad23ee9015ac5921d4f69 +oid sha256:84e16c79ac57b746fa56faa1fe96317ff4a28a50a3156df40e6f3db0ed3f2847 size 755142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..65946ddcfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:becbc4dc8a0dbcf20d0f7005209bc7d9170e696c20ec81e118a5143a220a13c1 +size 776330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..26bc6668fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b734d82dcd74cc25cb53cb94aafb50720e8ca2617d31934573074d9495e7439 +size 687424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..eef63b77c9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb60dc0aa9c8811c8df3a5e172845d226d8758d588893215357a381e78c471a6 +size 644374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5aab60de23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7aacd08402cef167e133f1802049501b7facbce3b5832d1aace1717d1d0fea +size 540913 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..31e8439823 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1bdba1d6054c491f6a40cbd9deb662a3bcbbafc8748f10c3c49f20c6e2dcc4d +size 608653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7a23cbd079 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5364ed5382bd791cb275849b56f78783b2855940d831ff9cc51e14ed722126e +size 508795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3aff532dd1..47c708b95f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a4aebade87a238e8561df77c5e85b369a4fcf5871e4728f9807208162f3f2b8 +oid sha256:af6cbd631e0db4d07dc3a932a2e699ee7cd8a3f465773de0f90e46b3e6a5561c size 797386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9e40f26abe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc01602f21d64866b6b017d72077a6157b8cb87eaf1e2e60b59aef99f98defe8 +size 664112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e7a87ec3a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eacf25938b5cfd923c584f1bd519d74063a10712db0dd014eb4b79f5928a2942 +size 628540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 08ef8dc7af..0df8d147be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc30d83f15df39c88f10ff665ff425f86f6210c9dbdd51e3d4db776c3a319ff9 +oid sha256:3bbd0c70668df940a4c7660841eb16a6f4ecc7ac3fdd8da1162fae129833540d size 779818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..80346a26c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a40edd84ee85b46b142b60fcb19c68cf2613bd9f4b0448a70d28686490890758 +size 645804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..63561e1c77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26796bd4b1d9783026a0cd5ba04dfb92547fa257d1bd406ee09de70d2f12dfb2 +size 616891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 39be14e6af..7d90bb6f8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fe35c72439ca794f5561f0da934cee8c9f996583d296a42d5f5abc64a09c54a +oid sha256:6f87a862ca3cb3d40a75b918f0bf1e054a690b6f39caf9423ce8037362335a23 size 823108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index fde27c44c4..a8cefb1c58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0743f645c6984e6600a308470f1da451656344053c3e93438979de8ba1734e76 +oid sha256:fdba0c022ebf05f355f68ba1e0fa6bc58ff73247dbfd16a87e1dece05a29080e size 769150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 916ad922e2..67c3b71fea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f3325bec4127274ea33ceca5bf857d95e68c907e2c11ccf74f1902aecf2718d +oid sha256:9751d79ac74df3491c8b2761e3f3e22d88b1adf008bfb09b5180625fa189e3fb size 806740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c28b90f08c..121602b208 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:126bf71cb21e5f9d95a614b22f9ae19a4a34e59badbdae6d39d7d2e2252a00e0 +oid sha256:c9baed122b03af0643d061d9825ed5c285819fab2377978f1f8fc947e5e26cc4 size 700862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 05ced6906c..82cc81c787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f6ff39b2e4fd4cf03ef861ee6cff3051637732e15d26e25c7ff672264eb4b4c +oid sha256:9d16bc9cd4a7033147dce5c7a3a8e4ba8593dc150039fb498ae24cccf1e240e4 size 907084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index fa6e4fa99e..5ef327a023 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c822e37969b4cfdc4ae854d3b2a9a8e2d6cb6f99184ca24783aa3ab8f2a13f6b +oid sha256:559912acf9b01a568f787524297e7abccde1dd421ca5ba57e3c8fedb52359e26 size 704522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8bbc2d35fa..b05d02f16d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c5ef2de65d30da4bc0fb84e8dd4faf8d131a813f1a519ff2b3116df1a14c836 +oid sha256:40e15c2bcfc7de0aecbce63c737fdbb7612636b0f4e3c2167ae77b7cdde17edc size 751238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 93f56b09f8..b96c9fdf0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcee31377b71090bae8641a6d0289c9a08937d321811b57453afe61912f818ae +oid sha256:3caec4ecd674b7af26d59b5c4668c1b10085ed8e1b5137ded75535453eb80d4c size 650788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..adb6047310 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7973b29cbdd438ca2ef0cdcf61835eb6885ba5407358628bd8590fe03fedb80 +size 843212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ee3025f754 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0503daf2a9092d14aa9820ce1a7eabaa989ce5d73160dd502b2b377f277eac5 +size 754354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5750a67eee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e08d1f294ef9d5cf83630a80062a633146f5431724349d50018bea49aee3b7 +size 668680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a48878cfbd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba676ae5c8cbf0bdc6d2e4d464497c7ffd0f3cf3f857a9614789a2b243b155f +size 561667 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..283cf00485 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05799a91169500153c8e9d1ca36869d0b7f21acbc679635eddea3c7280f3d1e4 +size 631776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f08e635b6e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c521d8a3066f6b728988d8cd23a8d449d13e3f53489fe1ef0530bdccfb034b0 +size 528759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 93322a1b52..17ef7a3776 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56628d81bed5d5fe6d094cd0e6523e5486dcac73df439e8cb7d858f000c6ac08 +oid sha256:7dbd1e5b2bc47c543da40a87b27440efa8d4da3aff71578b64f67ac3fa1b82e2 size 656968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 963f9feba7..1b0fcc82ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:604fb324943a7e50cb08d4cfb9143b752131d64ac2fe4163442e60088ec7da68 +oid sha256:31d34d2ea418d8138b311658c518365351e80eeb5dcdfe6155a7739e50c17958 size 683582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 492a6c05ac..6ec5218e1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:398d8bdb9a00a97784b325bec1658fb4c649a008a84ef504d8b9067890ee73bb +oid sha256:92291ea1c08de9cfa74472e7259d115630b01a44f98979b736d4784283579131 size 660172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8a5c9c4d46..76f02f494f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c95917a78ac6ac550a3e378b2e37912dd733ac2552001d8fd2b65b0af5d9391 +oid sha256:eecc32845babbe543e06953f62a89f1fec77b713dd1e0f33270615975fd9c5f2 size 681360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0cb86f0ac6..51d210860a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e70cd58dda514643b1548db2dc4f1ff16bc69e6b47af0002bc5471f8374abc2e +oid sha256:f6b6dace3e4d94307fbec90d1b8b77a923a5b684112434e4bd848d1c7fd767d7 size 726116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6b7237905a..90c4ea9288 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:757c4dcf6ccd0ce5da017a58f2b9f7e530ba0fc599222924daf3f02590a9ba2c +oid sha256:68ecdef4cd3433db4d4cee3ba748bf8f729a8ad1a34ca8b6bcf0e22f68548eb5 size 752582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1875f307d4..eac21fab85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f068ca6cc4af869ffc6a7548fb5eb1e8e09d261965fe2d6a15ac8d28463374d +oid sha256:a53c6f686be44bc6387f85bbd807985f6e74da4ee52b1377fe4e4de50706f508 size 742070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e1eb07f52e..6db96f5273 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:802799d416126e91c4a4b5d4452747ea6ae35d20a91b61477f144485daf29d9f +oid sha256:041b07365621600daac7c5391c38bdc4380c770bdc276cbdf0dff083f992ad10 size 722594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 280358b351..16367ca369 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47d33ee8d845c9ff7e29563d5c10e7794877487a63345bde388dcbd10312f482 +oid sha256:96054372cc71cfc939843ee353176248d216c804faf2e4913b50ae54a6bac495 size 666056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 7babb52c41..0704fe90ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:153c3ba0108a02a7d08e48cbc52ec08254ceda70bd5ceea89dcbef04a697551d +oid sha256:ab11caece54abf6b435fd190e0a96cdb255fe9ad6d48ecec5cf55ccaecc3c450 size 739564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 7d28a8d839..9c3029a5da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b1a1da08bb9c76b0cf01cbcf7065ffa3c87eaeae23de661c655eb01e21ccbbb +oid sha256:be0398c61204a89ffc1f051c400329677f48f47b81df8c10837863e59d062717 size 663738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index adbd483ca9..145be76942 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4a8aa0985f004eabd961516a73aa071ca27d66a5acd81fd50f4b151f3a65619 +oid sha256:dfab658536df9e82fb54a361e626dd438129612c0bee9675a5c43efcadfc8cf4 size 574387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2003a660e7..4bdff47fae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ccb537d1f916489ffe10c26a2829b720f01f9e46a9bc25cbf375ac65baf91b9 +oid sha256:b7a0bf11fc0caea8e4683e416f36f11165045c729f40a4f8e5e00cc5f7f4c8e9 size 621606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7e4745a9e5..bcd06f6747 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:730f6a38b7a1d1d2f52cefc10fa9770356823b993638a47b04d824ccc5146e7f +oid sha256:165d7e518b762583d22b109f35bb83f2534a153302fc948b3b042ccaf93b3266 size 767796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 969ed19eb2..df043bf74c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4fdc408764ab004dc0eb2a1655be9813b3ba6c511fa0e99ec5f81a1e651ae05 +oid sha256:026158fed1edf0b71ecd20d23d24abc86517b4908e6d84e94a2eaf2c0fef94bd size 705328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cf177077dc..fd3150b8b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fa6e8fff6dcea7081ac740913735b597482aaafe880b2c9eb1c10f48b125523 +oid sha256:d85dbd43f99953c7f9301f00bd1d6925524535a4423f694226b1efe8346aa560 size 620516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e79779fb37..802d75d965 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdaf818cfbf323cb268d43e7fc2d7fa04e0934ed24d42de1e62ed9a05deff61b +oid sha256:a32618f635d9c2fdad6b1c176481806a806846409711285bf9d83e41f46275ff size 733374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index d83f4737c8..635a6b1df4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:911ea423a434769c95ade02fad475e0f8e436bdc64d9129c73a6649014e9c2fb +oid sha256:b51b1ee375a06550f14def0e422861b6241ebda0419c98e91edbbc5119a45bf8 size 647674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0398a0d4b8..0d76db5bb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11211dc507a7985693775f9b81e845e1f34a7d2fae14f05b15226d4f9050bea8 +oid sha256:40da146e61c55d111eacf5b48a4e9c5ab01f685535624796a229fc1748a74af5 size 567085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index ef0ef39b83..794fe708ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98785a0bd71450dac330d91907421a649598140865bb10edd1d1050f785479ab +oid sha256:47b56a9b4fb6455af490b2a860adc44016f36819ac513504b2937ce0be0fde3b size 545525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 37908a2598..010c558f5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a44d5e11a8cbd062e521ed3654cd632666faeafe59ff86b98de0a604aaf49a1 +oid sha256:23d49db26013112b0fe02b23ea0e4341ad57c74a6447f617b2014a22670245cf size 558003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index fe4bddeeaa..d90ebcbc29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4d447c6b83d69bcbebba4eee31945c7865509a96fe4cb99cc60911bcf3c9835 +oid sha256:e180003fbd0e889cb7fd1c35066edd2d80b1d701164eb702350e3729f666196a size 537577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dfca86b3db..8b2826be95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a3c26e729cddc4ba43549619960a498122816a05e6dc00a328aa4a8427c1941 +oid sha256:299c1fb0d95f65c15b08481ec0e4b639b78c7f3875c6a0d7ba576f0c6c046302 size 739848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cc60b45727..3f150b6d4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9db960de4680f1c4c103aeda62832df9b6ac1579aa209be2c97606c159990125 +oid sha256:02342b6507ca0282cbf0560bf097d4539b9376dc24c0f1aaa781235f15914d11 size 765870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 77a1730db6..af460bd848 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07362417b1aa62658296acb3804234671372233d126a7f049d4d5406915375a2 +oid sha256:43fbc888f7f5e2696dbf046fe6f3629eeb1a2c34abdda9cd18ae0afb3515efa5 size 589167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b9b39e91f..851201fa9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0a9207d01a59a2cf3074fcf80b97ad06b55bdfa93456ebd6d778f1fe57cc0a8 +oid sha256:f4e2aa86f6a29407037af6e8f4f4942849548eb7d21c65b2aee28b4367807b5f size 484573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3846d890e7..ababdf803f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d31dd3f6ef1650f3b681dfd9c71bbaa088c5b51200a6a03c2a60038f9fa68d4 +oid sha256:9a61fc8bce2ec7e32dda70acccaf4b298f027a0bfc7f5f6376d72b9f7b104b30 size 562871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index f26601d271..20707baea6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41a0f162781e0adba95e041ffc9886291efd86b5577385f0f66e1beab243f527 +oid sha256:dcd88f1b30e8cb8a717591e0bbd759c1bfa05e339cc6b9923c822266292980ee size 462419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b9de2d8d1..05d5283813 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:660dac486d485a12b7385939a8ef0621de54bfbd3dc144e26f838e9d8d6b23a4 +oid sha256:03623d382435420df6776b111db1c45c12486f5a5f5da7beacec820d387824ab size 593365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b55db28e1..73b7f8f26a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:532b66c77e115ab0bf8bb2bae4cb8c50a6eff71e04da2703219c58654d193069 +oid sha256:4090e78c64b70abeb6c1a64a8ebe47b5a3a4760dc34da6353601c9a88ed489e0 size 571015 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index e994ea1ab7..a60bb4fcf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0626888cf698a977eda0eb56ff8af2e08a21ec27c2b453c20af5430c3d364450 +oid sha256:fdb96bd4367d447f6676a87c4b7816d5a255e02d39cafd2c739f71b60691f0f0 size 583443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 207f7e30b2..8460091a27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aaeb22cbf05f820dbc19959f6bb74cedfdfb12caac9ff337e70f90630c015cc +oid sha256:c8cbbe83b4f6a3f656c508fe64f20fa7d7a2fcf5501ec60327a8db6e734a9003 size 562277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 781d07c0d8..9a0e3df285 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1677d07df108d07869652fab5d6a1d0d622ad625aa4118d2b636b466ce6cd364 +oid sha256:60fbec101d66f64fe486d56b7de46db00dbb3b3340547dad6e597dfd739c2aba size 813240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index ef2916898b..6269ec469f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e89fb82915769db59518bfa0fcf85296547156d048e006e3fd06d9042dd1503 +oid sha256:13e54dc063c9eb48e169c41be659e369c27851bd3d85e353a6d7b7130f03bdcb size 772404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0a2774fd4f..f126dd1f0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5684ea44805d672edd777a5c5bffb8dfe0ca1f0aaffc7b41cb8c5304faa24530 +oid sha256:189825430fed35ed941a166089ea6de49d1fdd07d0c76b737a5faca72cbb940b size 693764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 28c9886476..e4c407021e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:463a7f2385938e18f215cbce1c61b69d9fec1bec947a97c91257a4df5f72d3ef +oid sha256:2c83eb2104540eae0cb5e045b5f0548be5ba2fa6024d9bdf03a59db7297dd249 size 771958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 1301ffb98d..32538216ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fcf6054918fdf9e7364893b500b50db65421009ee841ad1419f583b757887333 +oid sha256:e41fc71821d8d4cd0d1d275ac2ca8e8b46969448ab0a066dcb395eb2d0b1b4ae size 700474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index ba679ca11e..152d6cbd81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebb49f696aad52a5ccdcda03e3c50132b16a1c00e57128f359e5866ee29fb9d4 +oid sha256:3c0680c846f5bf7c02cb5bf461ea7e5acb3998a4d8bc837e4882a9ae1db04d3a size 612997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3790a1471b..d873a90414 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f765de3bcbbb9d1f62acd21d1007abb93fb06db707d061e069d96a302a6c00b5 +oid sha256:19484155d30953010696c757e7b89988a2184806e470b23ba8f68e31495513c0 size 656566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 92506ba14e..4c6b03b45c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:205c8e61ba00d8a5899a7b6f66edfa6caa781ea3e2fe2e8f9b65d1523af3beea +oid sha256:98bc67aa51090e3a71f139610fd7b5e8f0a2c126f1a6bc15591b07f80c06e704 size 840002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1e4425f5bb..85a17e0bfb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c0dbbfe42bbdb7199326937f05f6a9d903ed1a8719c9f58c1ef268addc7365f +oid sha256:176f2c101ab72525827585582e7b1d241c942d9c8bf2ad75c02728969bdf1e1c size 613079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6f84730d41..b21cf75d6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4eda4cc23745018f108ecb3d622f079e6bdce87a8e98d81230b25454f932a1c0 +oid sha256:9cd9a9bdc7b6851a4929be8dd3878c07304c4b8ac29aca872aa4e9ef2baaac77 size 508287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 29ca677e73..2b530e8ed8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b3fe21b50279905882122dcdc2a3c73cb54ff0ebc275328bba8ca6ab1d046f2 +oid sha256:7b2e2488f7e7a38a84ca0f24bd02260fac5da92a2ddd6433d19d3aa0e049a442 size 585647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index db2172b068..1e95fd0e20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1820eed6bcda11e15b7acf058426df325dd6cf163afb45ba703c04ef4d55ea05 +oid sha256:02176e579f6eaf41316e532d0a54a8df61ec93a801db1214002e13009b8224f0 size 485345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 399e1e1f00..80d426379f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cfd4ac9cd5ea5515b81bd4237a3d81588e6cdcc80fdcba32dc5071cec159df8 +oid sha256:cdb7bcc9eba8b69b65597a6436e17dbed2767de0d6fd17d1e829bc96c762f1e7 size 755686 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f90d5e6dd5..45213f0bc7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6d43b2a866807124bee6434133c92fa6143f0b87e86a33624a91d115cbc03c9 +oid sha256:539a77ed2ea3bbde848161611b66808b9757f6ac66f487425a7f15e37588caa6 size 783188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 2b30fa218e..903ec0b4d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cce1a4512312376ecb74dfeeeaa533487d440ac64e2f8ea9f1599513eddb35b0 +oid sha256:aa319b70a5afe8c07bf61d84c935f245fda889e1c80f5c1ec7e06d6670a41f42 size 759630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8c4d852614..680a1970a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83840c46b36a2a1a17ba566e92eadc7337722bab09e0e6c3ec401589400e045e +oid sha256:89acb63f634e082c274fe2453b457aa25506a29bbbfa664973be7297427a72f2 size 779684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 95cf6732d8..32980d7ea7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:844f4eaafb3701e6296300597151f6b6174334dfc6347719193930e7fa2e88c7 +oid sha256:a21d223ac7604ddd339a2ab2fc5b07ef19742375f59fd26220b04cdcf12bd2d4 size 823650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1debb18ac2..614e9d4093 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba6f531a70e76689c4b54d4c086da0715575e11d69a73b5d8473ab7c025e2d01 +oid sha256:c02c0a35a8fc88af360c17cfbbbaca80da6a0506799141639fb102bee51ba529 size 851350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7d676a6455..10a0ecd59f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d595dbb0d704d93758d55869cfe8370333c57898ec78576f2aaad0795169fd60 +oid sha256:b409b21cf254491250bb590bc23d56b3cd47a22c26fa500e6bc3d5838df7cc63 size 855094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index a19c109637..1e93434ecc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:980c76d6943130fe37b51620e7af17bd31961aef352a8373479466726fa6e6d9 +oid sha256:7cbb89582b132bf311be5f17c6b1e9a302421e0e54d88d23246ce4e2d48ac6e5 size 836310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f1c6ae6d59..6959621e30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78c22f10b002900ce161bcef9d5da297e34003128c2a1750e1ab31620f9c84a0 +oid sha256:0393d08c9de36ece02bf564c65c858d06a011235cc7a4b73446bbceeeb6787a5 size 717464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index cb7677e9ea..1c0623aa2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f055cc31e277343e220a83b9755d330bc7d48e5eb20c8e7aad12718b8349b6e +oid sha256:ba3c9d8be3229b7ed5e4fc24ed90f37976e840d756db7259ef833e573e081e35 size 805622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 94ed4eb522..02d198f0c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d0bc99068a2b7ff7ae0ec82d43fbf5bf8fc453675850b207462e8f34a7dd28a +oid sha256:bf5ff5e496f612e285423d233a538a2bf3a192e8c7b71cfc8d6104ccd47d2969 size 724174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7770ce50a0..066d276701 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36adaa8281e4ad07bfa6d31c911eac1dbc72ba539ab25b782a4df40b4f5a3424 +oid sha256:ea0f9a60f8ddba231d45a09f246905c5aff3565e987fab42798c61027a7b977e size 655844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d611219e9d..081063d2ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3d05da62fe0aa78de641f34031e068768a74af964ae9a4971017f4f7b83d0aa +oid sha256:f925ee9be272ec5119aad35cb1ca23345b1ff754e110e7b65572c17912883e0e size 880130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 49d86f2a5b..a5142b0584 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:245acc5f5bcc8d80aa39583edd55ed1b796e7e1dd06ed79d45b47ee338130dc6 +oid sha256:cf05c36b2ffd950edbc5878e88a2772b4663b68154f647840613ef01146f0af3 size 821412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index eafe19d78f..4406a5cfd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2335599547f6451d9e4cc0f8e741e0ffc2eccb8b3aab88283ea70af891a5f7a1 +oid sha256:618f56614a56c44a4602ef1fc18a093682499a9d9303c2a8ae709a2d64ca9e75 size 867118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 77cc672fc0..c329682d0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:626d334afdc2987c6bbc18b18009bdd37c50706d0d3ea88987aedc7828bc7dc7 +oid sha256:80b301143f8fd3b31c01c0956eb2ae26434bca8ad8706f3a8f2f6633a4cf402b size 595405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index d9c3fc3fa2..c3a73836b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b59b1d23f8e14b41db3ea0cf693feb75b45026206735d7eaf20758e3f1ded4b8 +oid sha256:ccd4eb10f3e94bcce6b8ea66de8712b8a6fd5a7eb009935f8fb898034b033a7f size 554801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 933fe00dea..19c08bb0a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f793085131a5af9c33eb11ab61a84498fb1b0918ec0a613a8cd17e475fa0916c +oid sha256:30e0cb5f67565b98f2ceb44702ba7c5da5776cf31d25487e5e56e7e7c1e4a798 size 582129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1403ccb560..6280980f80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4abc8cb5c2118f758019796db5aebf29f2ac31b31e91d56341dbad99fa7f718 +oid sha256:ca4e523b237d811135bfbe803b6f83175cbcc02ee3dd5b6b3f30ed6d804ec449 size 546903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b5ea37383d..1ff3ecd942 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29c228a2f687be3c804c2f9a8c79a7c3f4d4393916dfdd319e99f7d7af33e45f +oid sha256:5bb73e1895659aea2857adc994cbf08b8916be6311755fa5641626b718514280 size 853266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8382586a14..09f795b2f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28f6c48bbd60b588761217c3198e7c37e3ba88f544542c987a2dc3ed194dbe1e +oid sha256:01762ae4b0fdd03faede44d845352a705823cae793561fad159df553006829cc size 877514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8945a6f5b4..9ef7118f49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b0c11a9e56098f6573cececdb116eed5fb0737f7dc8e9a5b14423730f50bad6 +oid sha256:82ba478e3044f17cdbf9257289184f39f4a44dc919ccf9e3aa6ac81a1a3a3c2a size 602291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 58ff6099a3..b5971d2200 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01a431ca484f8472f28cfd9c759c503c5454cc10734d21ca62ad8a186f4326c0 +oid sha256:905c34fd9041b329b75b2daf7f6a73233e6512bcad07a4eaf1260f49c95a1302 size 501841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index eb0be22b12..bda8669ab4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:352d1ee591fdae39985794e3a9cc4a6c9eeae0a1976cd868a72ea14bd8681f32 +oid sha256:37d131324fc34daf2e5dc8414cfec961b2cdb3a777bf5e0dc63e7578a8089275 size 569137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 28a90ff7f1..701546d954 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc99bbb94c61a35a59452a52a19c1d68f2d9e750023b86d357c56eae533fb040 +oid sha256:e5b9710048bf50ab5d71497f4b713be20d1ecf6aadb766c4a16ddda7850a954b size 471697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9d34e557e1..da3a23ce95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22b5ecb0e9aa0c7b75474f29067038b96c7968d512dbd1df006125c3a2ab2ce4 +oid sha256:aec93bcd7a94311feabbe4974c7136cee1e180ad42fd1ad970b81fffef8a8dd2 size 620106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c3d07a5f15..1fd769fe93 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d96ea8e54cb1a55a7fc1111b59a1c5d9739911d4e7f398e0d4f4ad379d2175be +oid sha256:cf22145b450edd6790b587b2182006ef3345d3d0ad488d09e6b172b0e17481c2 size 580341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1bf19b2354..0fb0d870ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8045a5d0ab10b4ab1de74be1c6d7a4f490bab208c5f5b14126852d186cbeb55f +oid sha256:7a82412d160d6089eeb8567b8d3e0c687c7fb539c44f1bb408909d79cb70f56d size 606829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c6a9955fe1..8bb5b53301 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0943744ae7e2b40a98894fbb262b9e9e5dce30c5e31fd3f391565220a38fe0e2 +oid sha256:abf2b1d2ccffc5be31a448b061073d28b7ef887da6b3d30072c2b5bb5d32f0f7 size 571553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b613e6b0cc..27a81faaf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7a54a774e0abcaf4213d6fc318ed09484970ecdddeb87f8e4fc4d2ee1c87558 +oid sha256:5f4efd19df81681b7a5546848383f8ff992137042edb8a377483f013d26a6b39 size 927104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 68aff06b1f..c176cea497 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6727f94754aac3153397dc26beab9f04871667b846a8199acbdcfa21b901fab8 +oid sha256:70acf74351454f7575faf4dedbd855346668df4fc5e3d4e7b010e3578a9158b0 size 885428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 404ab576cc..5443f2dc95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31d8544d5ddaee042691d4b30ea920060e136b972f6670b39e5e70925bf3a683 +oid sha256:75d7265527d71e9660441bed1b13f2f468de7c23d0307a7205cb160f47c9110b size 746800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 4c030d6833..e664f04ba9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c4cf4eb6ae6bf94abdf2280dee39675efd4994556104a071e7347111375f60f +oid sha256:860d8dc8c4c469a8fd3bc5b83f343c0a1cd8dc5f825c9e50eb0918fa44062a8c size 839892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index ed2ded6d2c..1cdcfdd01f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a89a6626739c3dc3b71d7aea2f34164fb79310f32701ba786f271431f70896d8 +oid sha256:ef3908d9a46a759dd017e8d0c2ded67dc27759da664ca169047837e047aa12b7 size 761798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 85faa1107c..1490c92082 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2c9614ebbae5b625e82b9d84c55ec1a3663cb026944e6d92d08596d7ea90ec3 +oid sha256:106407c51ab474198799b25fbc814729883cef81f5f3a7d0b45dadf2566f763f size 692680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8b10e5f1d4..a7ca63ea6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4910e2d9538823826a67edec1295b4f6a8b1ab16380aa9b00e22def9f1632d5 +oid sha256:4960b04420d4a1ce44a75b60b1c82bcf745368568a6ebbda63e9ff66f4a6ce82 size 952386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b91d58c205..e5e87f80f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a83f859d5ef68407e5924eeca5babaa50bc5a2fd93bdeb3a82cfa090e626190f +oid sha256:164a0e00b16c1ee927d0f7a166495a86b80dbcada6335abd1081f976f7ae871b size 627092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index f1a653d966..82f7309d3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40c867eadb9b4f499c8daa879f851edef6a35c6cca2a43f87582b006f5692ad4 +oid sha256:0e7a4db74b5fa63ecac1227c4bbf9d90f499377d1201b9d1f11cf9715c84403d size 524025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c4808178d1..ecd06c8283 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12c358e10813b244693581f7d5c3a1ed9a720b59561fdac1248078c2041541c2 +oid sha256:c8745d57deb568a34685239277c01f1b8d5ddb291acfb5fdeb89585319c3e558 size 592655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 79871c7307..3e03142a29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2624be17273ebd2be2293b8e8c1b0f4cbb20d34fa4a9ff6008a2471a1f5c3670 +oid sha256:7847a98b2dc0ada6f2b3c3ac1d6666b26a3f42e4d51c99e9d0969f7221b7652e size 493831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9713541918..b83d674867 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1a186f02ccaedbb06447d7b5d2029166cabb6d3f05485a1367d84bc380a6c42 +oid sha256:db30085e6ad87df6d722d62c929a510002e87a792b5668af12284a7c0df5a32d size 695302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a0d9450105 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba74c1846d2aafa7ce80d19c0dcb3573d152566ea1cfc53236db9c3cae37cbf5 +size 716884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cf09d82f1f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70fcc001595ee1a47c76818e0afb218bd6612968d36dd2f1172e356f20952b23 +size 613521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8e983c0948..358329e36f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75e175802a2823c78ad13633ecdef3d8f66284fbb5ad3daa579a9d8547e49c99 +oid sha256:0b77f040a660dabdeec21bd454c49248a8f218bed269bd631e80cadc5619406e size 691204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..86295c6335 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f97c896284538f56db207acf127439658839fe596050448bfce72f299a113dc +size 710468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..da409498be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168fd273b6f98e52c0f73c855e24ce4afd98eb9c127abe837bfc6a47baf86167 +size 629554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5c3fee3bf9..8fd5bcd3b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32284e6fbb321917273211e6c475fd821c7950ee4b8d31107c79a6a107de39a5 +oid sha256:fa0d4860e94bda6d54623041b79e97639dad99918be4f594aec95c0be02b0005 size 760798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d0d11cbfa3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e7de0df52ae4e2b6186cfb38ca389da7de2bc3aa93af5202cc2e47ea07807b +size 781690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..44f4057a91 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aa527b75efb10ffd94d8be215637e640d6a3540f772924ccce69487cfa94086 +size 684200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5336a53125..32ff4de2aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acf10c41378e6c1608f57568fcb837ab60b02ddc8cc7a083b34e6d358eaa1c4f +oid sha256:2e031cd52ba3a05dc1448c0b602e5d50dd1a314d3fa31273dd7a0cb398f569a7 size 828528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e1d076c789..bbf17241f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aacf5ef680bdc4f7d4df31dd6094af01cd1dc078e0f1ba8ce36aa876d931ee5 +oid sha256:e14929df707211caabad30e9db58e9fbb62a12cf7e430c7cf3beebea465108be size 802948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d5b963b4dd..1316fce347 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1b247a133fd8ec57e99f80b94b8c3434e36d0a49e84d635cb7aed80363677bb +oid sha256:b71954f682c7fc38e69ed3b65b7f33103ec873068413c31ca614c20e44d3b9f5 size 769858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 66c4d84347..653b995d71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41d04938af73a7dbe5be33b0fa4c092cae7a322106de9090f8e9c7b19d6d79b0 +oid sha256:dd8cc69a29abd91f6c4fd3a675246937b012fdea9393b7860eee8454b0bf027d size 835568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b54054d66..d633d2013b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1375f8a05b1d8bc05adee43d0312201aaf3537d5289bc8058a2bf069cfbd1db +oid sha256:1752d965fc7e4c40a68f1dd53c3662f84fbee761abda753a13ce492c41f3888a size 725350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 3d37785fde..337af92d3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53d189915831a20393998aa5f86c1abaa1beffbee79022e5bd917b81e02849f3 +oid sha256:604311f30afdaf68ac49472b5867ba9e9f48847ac848057a2e8ac0ff1c3b85ed size 925454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index b82dc5733d..ab8649e884 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7f4e793485efbf3b1acb5265f06d8e080c5dea10ee53b6f3af33f56d6fa5051 +oid sha256:e29db872b6af92a50e7e679fec588da6c3ba03fd458d302ac57e62ed747781cf size 716922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1e24e1815a..844e411806 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43e2dd864354833d1c6b0e5547d87d4e0c48e2a3dffcfa2f1f7dbe9fd1ecb790 +oid sha256:26405d33759e4cdabf26daa0411034657889872caefce11dd879b1e48360a35e size 766500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..51e49ed441 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f1e339b61ff546dfaf883f14d970b2085fb82df034332843197643baa9c43d4 +size 822114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a9daec840e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a687de0580b0933d0b03dec9ed955090fe93627367bbb72aeeb60e1068cf125e +size 725560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2a70ff3545 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaa35d16647bf93992fa0cf7c53281e564760db82c76ac5e75aba1aaf34b7c34 +size 661858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..66f998a22e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78fa402ecebf656736955a04b4accf32812f453f0783b8b1ea6590572c2b293 +size 625498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5bb7064a5f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a576220b5e64ed2af57e874863b6bc810b6ec1b15d9f2f48183658b764c179b3 +size 644338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1cebc3e11e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c2d6bcfa02b12b73e08e83eff91890010be545f5af1c77fc4412feb4657289d +size 613601 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4497ea8286..1bd82d143d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5dd2c319dd3d75035e599ccdb17811bf67c6730af63137423924b6ebf68fd31 +oid sha256:db9e359f2b2ee7c55551c03545ca08d04a14812788ed4c14b9c7b9cfe954720f size 802996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..17f9668de6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4347471f93c82abc4e973dc0b8c778f935c158ae2ef0931fa0b47668fa3fb9a1 +size 825960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ee651aad65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac7f61779c1c78050c3a6ed0b3bf72ec2626d8ccce7e7b87812deca39e5602b +size 739570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9220620fcd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd483dc09f2b2fc68c20af45a9fc7f854c4c1db992a01cc5e399cdd5a557842d +size 662628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fc0ffffc3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6f5cf6e86fa9a9f3a3fcbfb834d4e076964dd9b06f4807cb2a50d16d6c6dc0d +size 564051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6c02e1d859 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:471d78de818b804a92e25b77b30142dcb73dd0c1c667ec868c0318077ffb39de +size 626266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c25b413027 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e2ea93b5faf4ef392b5a193c679781dac29320e9fb95c984b1958984bc3a74 +size 531833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1834a0f57e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3b11e4b36970330e1d42dfe2eb848f8bf49dc65d53fbe570a7d761a37a7c5cc +size 688138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..18e35ab3e6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01fcfc04552677a6d26e04c9f0760213414a749bcba1a31e29e832498f58cffa +size 650938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f9779d7b48..66bdbddb10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffc9804e458a782e164588e229b0458ff4532abe860ac1cc28f3d2cb39274954 +oid sha256:4c3334ffd4534f2e797afcdd57dafba90b1e4824505c2d0c8df5ceab1e9cbc13 size 857172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6618526fc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13a8b1fddf05632c0d055d278f335cf500069b4823907f7d8d371f6d01dea471 +size 669828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..91d4202127 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa19fc1e7326ceaf8a252f962734f54a9a716ae71a0c1decbabcdc3c2e6571dc +size 638450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 986e480c09..5bc71145be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8c9f0023884872038ee1e21e2d94b5d007b148d0477d60475673cd1d3139e83 +oid sha256:b988c5e7af56e07f7b43aab4edaf9985ccbac0905a67df8796bfb6dc95a906ef size 876784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 037cf584a2..f29622c0ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e16af4a84718af2e07b8928ccd815e37758d66b572955fa246b609713c4d6bb0 +oid sha256:0a44b2b886b04a73b607a733d8cb03ef7c8e292577c538d6c5ce5b9efa476b32 size 817200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9220e47874..16d86b84e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ae1ce7929fa32929605d2ac3506b39400f70a150fe3d7dc34489bd7bddcbed4 +oid sha256:ec87f52151582f258b03331b4296817c85cb334f6601fd043505f9b0dee9e0eb size 864806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index e61b7227f6..ee3cd44d5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93ebe2e23b0a390542b8d62b0f2728c28126fe42c4e95912907526ca553de228 +oid sha256:f2dc4c016b7b7c77fd2307f96f485ac0d005fbc544d4f0c3744f87920b22ed54 size 961696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 6f71eea390..169f732048 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d18e18c213d0bb08482dfb312370b577ca26309870994bca0a8e0d6f0f124fd3 +oid sha256:2d5a9abf08578268cea033cedb593b8352ea21b7a38d2ca2a3a477ef7cf991a4 size 751832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d63ebf1db6..e0594b34d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5259ebede811bdb9d6aa8aff6f5fa4b528233975f3ce24c857181dc7259333e2 +oid sha256:b52deacbf54a359b1a304c5642e46ca94c1ced4135680b4c6259719977c1df1d size 807774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d7231248f7..361fb4a6e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ada6aa889cd0c993fd947ba9b348d52033ee7b9a3769b26db5e05050924712f4 +oid sha256:b7e19edbf416eea951f03ffa0b0236bcc04fc8789cc0ad0c1389b363482ab2d1 size 700318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e2d0719336 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:072c655f877dbf2d2438ea6f9f68156f21796d5dc690269b373dc04f5b078ddc +size 896640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f737ca37f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c9110b2a4e15170d44c21492fba62d791c98e56740d2ce9adf726f9e8a9e702 +size 793672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4b1418fcca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042b9002ec7e8594a8f1cf43ffe870c5608c2add1a9ddeb3950552a43f519d7c +size 686242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..95b8dd1905 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40beff972cb6b88cf0f3c7af4c1f15422d0a73ee00d190ab462328fb3279cede +size 585593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6108e7cfbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9837ed51f2cbe13252fc7ae9bbe79bb903c53031bedcdb8a84daee0e1087e321 +size 648550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..50b2337a33 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:309b570ddaa1cafbf995f4c7fb71676cb7e78d946613e4970c301d793307b8bf +size 551799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b9fa2728ca..5d37c0f157 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04056748e48b2bd8551a7862ed14bab8f20a74cb9833910ef68e600f31470cd8 +oid sha256:d938eb83b47d9bd8b5c518eb84663118c6e445013cf96cd985e816ce6cbe8b42 size 689330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7ce0977ad2..ecd95af83f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d60bbd10f194d17a822f49d1871e8c7a6bd85ad06e11645e7b8031a1d739619 +oid sha256:571c187df15303571db41052a007ab709667d197aa9c108bc624a9ab76f94142 size 735234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index f1f7a62c96..70e4f65467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6371453e7c8f6ecbc895b1cde8c68ba7c0b96e34cc40eff8bec6f0af15d858eb +oid sha256:f03b508b5639fd6d65b77537020a5bc935177a01ade70c1336cea0ea8a7a3bbf size 690808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 72b5713829..39f8a66b3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0710b1cd52f7af7d998e09f0a56b1c8bdb7863eef70e7ba7003c57405031d69c +oid sha256:838a6651d9543b19338e9f1c5a1b7be1a063f039300ec8c315db32193707849c size 733406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f3d143ba45..3e2c06e8c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:436ab39da49e0dba2c872b800a4b5ef4d2262f256f282d200497e65dcda4645b +oid sha256:236de1e9f495cc79d2ab3a5998a044724201684833cb9a5f2fb315a616a1e080 size 758084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4ff49ebbbf..3aaab8a084 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fe75c69b5aa733268c9b1dd60a3774a02a0e0b7c3aca6a436c8253e59ea5a16 +oid sha256:e24d5fc5dc9c714de2a1827920efa0cf02ded1dcdd88bd74597e8c8a25cbd199 size 804184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6a9e9bd1e8..71b10e4165 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dafcdd8c7d1c4d2b32a54b077bb1e91f5d222e51897750f42bafcdd710f27437 +oid sha256:5a55495750df2239425242c91571f2b7ee6c832a474bd0f4387ed1ba2ec52a2d size 778674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c72a157491..e09cfa7b4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66a1715f71f0bc5622622ed25e52d3514f4df1be2f1213e841bbdc798a7b0a79 +oid sha256:7da005e17f9dfa4ab3ef88c2aed263ffa9987fded29bdf371b5b0bca5532411b size 743956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 910ef2e7e4..e3ee142420 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6d4de26bc44054c5dbc64f906750858519f775adde6102262691849fc043b9f +oid sha256:2a1f701871be6b053313f3c651ad5d7495eef10fd010a10c07bd2bbbbca2369c size 676516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 2ac24f9aa4..c4ee621817 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67fa7f61a9b368014bddeb34f6240f8323dac79aa110c819682161830efc9b85 +oid sha256:320c46d115f463443d0fdee9d68f0367c433cadc000dd5f3242cb2afad524d64 size 752192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index dc331b0b31..4a93c0b933 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23905d49e7694ae47f35f0d6bbe230f6bc2d444611a73afc0ccb9a8bcc19ebe4 +oid sha256:b5affdc8119c032e826c7a2a88013831fdf1ad5c48689e4c06fdbb3c945becfb size 672124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 54fb5f7d64..f24f513072 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a214a3c60dad427b2629e314df47a2b5b5d78c12ef1340712db4b58e6710f211 +oid sha256:47d9b7b2a521e3d45e9d9e089b41610f90a30893c593dcef7b9b93a3a9c8f3ef size 584351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7e78408f30..5b8305edab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fbf48f0f9e4b03a23ffaeb4fb5716c086ef2c2d3c65a40675652ba044593957 +oid sha256:bc83381585c11ead8dd3b412cc47bff3e6a733e71306cecd7529d104e9355d0d size 631176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 30c3996844..fd8c52fdf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c50a26e3d4d1589579abf33e915b81141737cc6f322bae53af0784138a5c42e +oid sha256:d8974207649635339301cd2ec5dd9734cd6ff62216689608021dd568f085fea7 size 818462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 1f7f2e293e..fcb189d5cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a10a6a7b359b7e1beda42f9e996dd3c57709fa83b78147eb52dc418c46b0ffc9 +oid sha256:bd6d5f16fa8de1b011062f86301dc145a095865dfe3d46980972caece07fba1f size 726690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6734997601..0e55d02bfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef13aef508d81bb9eec24e173579492c9d58453204891ab5a75014e39cfd4a4f +oid sha256:8c9852e393a14df42a38f39409c7d80a389beb54be155a19e4bbaa6986bb69d3 size 642666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 25dfb1a187..5328ee662b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c757119be60f18208ea33914442537b0512d8aed055e9bc17b53cf83fe50854e +oid sha256:3a9e5190852081dde0e9271e7083648c999c02603ecdf76999ede36d39cad152 size 773284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp index c54d02be88..4d20fae183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d88b747b80e2f486d7b08009e2ccf34bd83bedccbcffd94bbea1c2eb731de7f +oid sha256:a6e3fa68efbcdbe260bbd308d6b8c50f4b11f511c9cfb4bf1c6bf87ab62b8577 size 685956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 07e466d8ca..7783c337ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3aafb3fe4eac2385cfa7f8ff66e31f281ed90ee6b34cb478da4317068242eab +oid sha256:ea41141c2a8e1b47ebcbf69eb51b2c1cbdf3c1e318115f1a553023c868b9b0fe size 566741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index a64e3723b5..6625bf3ac7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20824666bd37435d04482ede35b4876150f7d2f19223acfb8d527dfb975cc8d4 +oid sha256:10858836352603b23e0cc51acab0aa131b41bb6ebaf144b196fdae8a60973fba size 544243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 97572454b3..adb9561b85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9455aae12325bc13b49936d86ec377d1ed4997e1b5e665e52745f07939491fe0 +oid sha256:045e3d27ec20752981dcf2a49ea1f683ce3cd01513cb8b69465f867461528494 size 557607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8949b24e68..aea5f5b47e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:988315e62084ec14ef14d4d9dd3e18fbe771f33873573906d18cb98129117f54 +oid sha256:91a99798780abd362cf078c002b03a051f06c89eec2a91fb468383a36e92b945 size 536393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index bf2732ef1b..2bd6cd0960 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78f62d5de65919627d05b99dbb0d88ab0cf538713b05531bd6ce9ae11994369a +oid sha256:b2eb356f76d3822012050305e6b44cf0b5a89ea85ca31128d6ed8b7fd415cdb8 size 769496 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 122d1a4b02..e966ad247e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e097c1d8775b5781ca45710198f46939316855bcdde6ac0de02f36db150a641 +oid sha256:d998de74acc9514019d876ee11581a2ed6c1b72e84114a13594fc097a894e218 size 815796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5df6d8b8e6..cf884a96c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3d86bbc91c808deb027e3b85a03051716459f564b21b5eda602b3cec961ae73 +oid sha256:e003312edde635c20f5b881b1c302d4ab4e41532df61a425f13a394f7fc2295c size 585517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4fc46125c..a87a9644b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:856b22edb252c881c03fab49ac0cea8761a8e90647f875a17f3123b23329b803 +oid sha256:4cdf1a3b739181ea0f4ede4bf83b4a078793b081f4a67d30d6862ea18b2e060e size 484129 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a10c97911e..61fd4f403a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88d44a4326d518f249ae45250721181863c16d2465bdbe33a6b15b7962f86852 +oid sha256:027a83192e9c982ed8663b225944509d8ba3235fb85abbeb7fce5a36c52d41e5 size 558579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 646c3afb3f..7d8049ee5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9763452231f480344108c9d6bf59d9614318f509c08111351d4bfbd2aec56f1 +oid sha256:76df039b8f3c99174f0ba214e1fb5e8f37863ccc334570769306b935bf4e5e17 size 461187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index b2921e6b56..89abe67f7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25056800565159f731008422a03b4227b1578bb9d39b922f6497272575853bf2 +oid sha256:f7fc7088d27d49c83e8693fc0991764564c9d770d403cac2dda247b61a39aa14 size 593019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index de4121daf9..c71d182b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17b2586ff24aa976bdcbf53bee7184a2aeb934d182520515431a8c8fc69f4490 +oid sha256:5cbae8f637b4e31a4cff8eee14d15a502c1474c9b5a5f74b0539228413756c16 size 569781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index ade22cc376..e5f5407c36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b39404e16f0f1e91eaf7f831ccf76b242c484db5fc6bc758f5f1b021e0e78052 +oid sha256:4d4786d82e02f09e4ede4d031854805818b4e2c56e1ecf19b9563b2280004f26 size 583097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index c4609c0ff8..5864d0b164 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9f8521168a520f6b8f0d07f6125320a2a38cbd979d058e3b312952d7dc6ca1d +oid sha256:dc3ca5d0865e84ebb250aa8df6551c3daa232b6259d0aacfed20439a4433c0c9 size 561043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8539deab4f..69410c31e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:074fb3e6412bde71f1c5cf7da183af914fd4088a99507a39974e02d6a17689bf +oid sha256:fc12315a00495fc230e70c84d6bdfbd734486f6ac15485b134fde9dc88c8cb19 size 849894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 2ca3329ea8..7d34f2a1c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:420844aa5c75e7bb4683f8adc059280341b1cc8001112b45f39b29d8ff2237da +oid sha256:4e22fcc187df972c9ac51b79f741a5c4684d31bffc67d6a9bfa3331114c0f620 size 793766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index cfb56c3324..838f9e414e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:955bf8279d882dfa5944d39eb17372d33f09f66349637c3cfe99cafd79f51a6f +oid sha256:76e9cdbaf34b1817563bb9c39e0a779536ed2a25df07aa8bbeaa61e68cf11ab2 size 705852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp index 4f37eae77e..600f4819a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e533f5b637452a380e8111254806809912f5f8944fb09dd01d56c96b1588883 +oid sha256:70c4c0e2eea823dc591fa5748820b4fdf615d35986bb035a0664cbf3cd58cd41 size 782960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 5cf540c066..57671c50be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:230f6368b60233203c9e5acf707bf35be3842833aa8bfe68e30ee0ad0ed310ae +oid sha256:213eabe11ba6876de51237e3880731b370f28f6c78f79db4b450e12fb23eee1a size 709650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5329cfd390..9e1b605fcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:037fb4d593cc0a5f19595a852102af1cacfe03af1dd88b53595e543d4b2341f5 +oid sha256:4d5b730d5810a66b47aaf423b86d65e9f53e3693a909661e449003117b515888 size 610383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9c83eeb32f..92755dd1ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32d2c2a359af8ca7e80a9169f96d469eb02ee407ee63b285fc9df34a4f3a3f0c +oid sha256:b0b700849a6436417b9b61ae662154936298b37de123d05fb89e4de0c84e873f size 665446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8d07d0843b..0074117b28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e8183be19a991a9a6cead5c79409373faf570f4c654a5a30a0a2c96d31fd4ea +oid sha256:641e2e17cf92577b995d8f6aa44ee96e7ed8dd3a4096a70c1278a59c4c5d50de size 889928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp index cd74c62900..9b1f5d48e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83bbae2fe4204db5758281b53b91f898755a2509f04bcff36004b05fbfc6cbb8 +oid sha256:bb9331dddd1a140ac1e513df0a025a2e41fafb62f15f18e50bd6c8c79634045a size 610315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8f5c608c0f..2202575d7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:489ee7d671d123b9d765cbaa1eff59462217e1c509e5a62e9a926a8c9c2769f0 +oid sha256:4b7a41a8c4fbd3119098e9e36a6ef3e89396aa021092051505cb1fbe454ea60b size 507843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a42f7c747d..603b3cecb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68d5d620b4ab6c4c4cc3b688d0677cc2ab2058bb0f19434e37697d29be879595 +oid sha256:7dac587fd00757f73c8664c322080f12726a41aacad442653e5e68112836a89d size 582145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp index 12344ede7e..dfb96e38a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:865f96f726f7fe0384998580e7cf714041a21137f343e3d256cac06017175914 +oid sha256:d034aec454bd63081de8b8517f8b566404302c57f043f46874dfbf3bda92c1f1 size 483321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2bcf5df5d3..1097f6f042 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:430092244399fe87ef09799c3356d6c7693954bf7284dcda1327c382dc1e0192 +oid sha256:1e9794760c8a6921a10f990e2582ea5c959960d3769cd7bc875faaec0c0f362c size 636826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 7abd18eb0b..9320346e0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e0725fe6bf6b2137cea2a47c98f70fbe06f0d33eb847de628ce9c63ede6b049 +oid sha256:e183dfcc9629c6d8c76781982cfa37e86f7cdab6f11793026dc8f59998760e31 size 567135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e6840f679b..775c6858b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d147e0bd5ec7352140406cf21aa571d4886a046dc9c5338ed30568de21bc98f +oid sha256:ca3197ca8ca2fd3cbcbf7e93765d2a14b2273d5bdf5b03ea948719ef48a04958 size 486041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f1dd6c6232..bc57aa306b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6aa35dc05335bba5c414b9935d3016b4e991d590de45e308d4f34df7605d1049 +oid sha256:41b23aba4a683f84bca743f15edfb14246704385d1bf0bfbf1ebbcb29361877e size 455527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2876f21782..0bf8dd6db6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea7b100ae9d333cad50fc83b4e16173630aeacb2a4e0d30e54135de4415cd981 +oid sha256:268c4696907abe399bae64a082daaf5c2ff82f11848d4adaca4f581047e87b9a size 475305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ea7b69e93a..c1e433f376 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0e22da1348c8d05fefb02a40fa639377158f426517dbfc287f4e13284cc6992 +oid sha256:298f31a836f5962fcf594ad136ae71663f31d9b9b2f9fbe47a4f290d21f2fd66 size 451131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b17b58f3bf..68183f8753 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2994fd613b90d5eaa9d32e28e7f12bae7afc0fe59091c1e73be4d49678de0d0 +oid sha256:0d83c9ba3053d39b8f9085e2d841e52ed7132a668fe9abf6517ff8446c82f73f size 634160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b48603d362..295ae4ae69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:18bfe085252d4ad28c999013b9000a38d92f29b963b0dc7172d45b8f5a9212b5 +oid sha256:5d1d2aac92ac6bd39e656e7f4d8f5951d1fcf4454ca26dae282ed5237ac1a67f size 569549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a145b7cc16..3d1bf2785c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98e41cd0330bbed4d65b20629479d60ac58654c39e4e3469a41ba2c744bba361 +oid sha256:ce9bb5d6bd4f237af49d10262dd0c8cbbc0e4fa8346b6372734e52534f9aacc4 size 455903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b1957d758..ec0be29be1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:003d475aaea8bd2ba7ff067ead1284e9e122324af893a55e1bbf352377f7370b +oid sha256:0fe86f5e67a5f4b5126e78260eb0cd36c2562d5830bb948bf548968c4b83660d size 397139 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1717e49e8a..0ba289ed68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61159f6e34b6d97ee93b8b629796bc87bf03a47a4c552ac4203d3303f6daa549 +oid sha256:a5256296b50066bbb4370480005c39472c0be07ed9af1dbab349199ba5f48d0d size 439621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8395209f82..ba895da752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d817efd23b62d53b90a55be4cfad24bc190970544485d134603ee75b512feed0 +oid sha256:8352838266dbb462c5a8143672b4c769fdc4c97b848ca8ec7555b86c05a681d6 size 380857 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 61de79a944..0d497d1905 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ccac57eca2c720640cce8e04ba79db9cbfc6ac497d881531386a572ade1fee6 +oid sha256:c3f217702d26c68da7e6d76ec0386ae19a84195e35d339c21376c6ac16df0f04 size 508177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 19589e4f51..868908a19a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4825b0ef08f93a0f3f19dbcabf908611930df902d38828c4cb186ca9d9805cf +oid sha256:ef041dc47da5380b44fc9a8850f48dd3a66d93ab91d85aea253f45c5881cd358 size 472137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9a51d31394..261a010747 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b277fb5583f82260dd1923c26b2812b180f2acb25d52493d9b27eb49c7d50ffd +oid sha256:e79f1ec1c1d2cde50ced951c90d5ef309c46d299b2de5cc3461c2e30d5ff8a04 size 497441 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5794d06045..3d7985b2b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38cc86dc32ec3eba31f5b07325dae74e88fc5deb5fb31eafc43bb16330bdf032 +oid sha256:5804ca9380486bcefbe4277a4f4bc5493f4ab3a8a557f67e6d5bad4727fe2252 size 472477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a1bfd0e3b5..6216e7772e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf5c81bcfa45ea3e1d90795121cbf4bee94f993ca1cbf7cd0afebee56a0ce3c2 +oid sha256:6ed49ebc12efe865e0ed9d3017188b12e02d157294b76774eba716257b0a6299 size 677214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2633f77d43..242abd6043 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:508a56a4130e470cb9b759773650e0855255daa5f86f20f67c25b9970a1eefd3 +oid sha256:02aeef79f476cf006c2bcee58e95f7856d7828fde9fa8f982e99ec562549f105 size 608951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9237b76db1..32f4c3af64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f26f91967fc8443e4076373e92bf77cd0efaf32e1dec8edb1ad35dd63eddb6b +oid sha256:6cc6051ba0a201df41eec73be31c864c9d1f8b3d2c85bf71b215075fce6e3323 size 484255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cadb1ec4c2..f6374098d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28d04c1821119066a131aea17c8710937590630151ec8e75b041f76e70a996df +oid sha256:e250095162b8b82be5f61a4c97ed4f41c9c6eb2b37437351c46af5c650297898 size 414415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b81ebff3af..862753af07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42f6641a9d195da82abd190c3266cc95326e742f7a87843c789e7b8d14d4c62e +oid sha256:c06e52a73680dde5c8c6397591d2de9d8b3474d501386451df43d2b5611d37a0 size 464027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7a4a8f5829..a1067136b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae63926478bfe77b9c733ed9fe51a95dfab64d2b68621e48105e481df2c89372 +oid sha256:9bd286b9ecbf7b7a775f8666afacd4d907996a9930d31a6f5483a67da1622c21 size 397345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a9187e2685..9fabaa5745 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6edf32d6948b8e1b841dea7575b6e46fb9f58f914c31de0762e4430626a3f57c +oid sha256:3aa6737def24d101c9bf3db4622342c20586b033cd23f94d1b0c36038499e227 size 667562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8014d1c6b7..6c3557b463 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79cc4c38e30aa6c6fbe1e344da6878b5e52f95454915652df34619e11a2661e1 +oid sha256:fa972e367859aecacb3701a31431353a63d2f979d3d2611441c52427054a6ae4 size 575817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e2d0a8a8fc..d296f546fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a92cebb6745b88c8d3db4942f726e059f7dbf9d3a3760eed50f031dcfd7976c +oid sha256:09b2987882a2fb8dd2cb033af1b20c9134c4b615c3a46c7654a327e24a6e3317 size 598917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e7da93970..175871794a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59ba8e3b6e15a7ce4ff690cd18324edfed696d586ef907b45a5e84b45ab960eb +oid sha256:1ad2cba12b625206451282000fae7654e8b82aee0b02e0a8e65dcaa9221dac7b size 573927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 16380a705d..2560c774a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49217c0f00260b06f80a09de87f0de4f3c2dfee9eadab2dfdc56576916fb6806 +oid sha256:90df126b12b1cfd15b2865d5aad35be9f3a9ad6438c8855fb73197ce19fb25e7 size 585023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c6472d5a20..3531cf03cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7a98b83ddcc98e80d1e6e4e057775317162750a474379ac24050d38b01b592f +oid sha256:0bed40be9bea093bebde08f73127500eee644cbe2cdd01626c4ab89bc048d9bd size 565583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b9acce7141..ce27c24c3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c37ae6a9748ad300ea4d156fcb63697d32ccc4534bd0c4fa7967b55a8097cc5a +oid sha256:304ca95564a2c31574fc1f7465a8d3a56059b92a494e6727d7791b7d9dcdaefa size 669928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 845a4027a6..1ff95a4208 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f4debf50178023a3d9b33fea16530fe918deefc5ec679dc81c267ccb4a58b2e +oid sha256:0c75028687c23b121e00c35eed5b24259befd6524b8dbc75b4f577af75800b0e size 610669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 17c0d3f349..fe7c284092 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13d2a73d86e5033b8166d607ab5296b1a6b536f0d2d4a4bd423c014f1181cd5a +oid sha256:d8252d588aff54843371d5cdee86adeae0d8da62f60550d6309340dc56116e93 size 564043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 91cbfba1b1..9cd3310439 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5322f75370b28700696487e22f6d64819a8c2c719b146739f1c48b484af5718 +oid sha256:5b8f8d2e58c91f8df6c22d442e3dcbc8a220f08996560d2043f117f9dc9879f9 size 506067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 04e20e186a..e96ee36efb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:908ddca8f4fba0efa7289cd3ec1ef5df199c7c5184920ced56a66603eaacb1a4 +oid sha256:eefee97fe2192d6c814e9f6ebab13abb9032bb9372d29ad8dbc48fc713ddb9ee size 545393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5a31c6ce44..1c9727d0c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2deca6ef896c04f24f7a52a6150a2255095e725de6e17ad5f182579942a8718a +oid sha256:ede1df536d680b33e9bea49df9df02987580c184f2c07dc78632ed17228022f3 size 488997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3f2f3d3ce5..dcce4da284 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5901c034c1e4119a03aad0f59c552f88add42d532a6cce943e191b026b792727 +oid sha256:664d478a1a5f6fb27118112eda0662ec1f69abdae281f5f9dbf80da2335b1964 size 621842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ea281f54a7..eb02e03434 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57b06529b3adb2a0eb17e4da60427e565cd676ae5454316505f7a077fac65439 +oid sha256:1e0806b817187029541999b65641bb52398cf50b4c4c1fa0e80f346f4a74698c size 596063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bcd575f9d7..07730bbaab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29f411650cf42af170e53244680bb21992a7e388002599b850ff85784520414c +oid sha256:e51a6439b1898cb0a260ec4d535804779e57af4faf15aac63071dfac6b95ccd3 size 608737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 866447f4ff..3354c6c0d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ff59e52c92dafd012ad18893355c647982b20a12a6f54a3abc9153bdd22d3d4 +oid sha256:abf8aaaacd511af38dac137b1004946788e12a83ce1c2072480cfb124463592d size 587719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4cef70af3c..520d55c805 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8198555bb51676dc3d0d1afad3a14eb44bb6119d110939727cee6babf0df5ddd +oid sha256:ac5d9667703f0bfc6ac965e6a96f363764529ec8a62166ca9af7a7a27a4ef1e2 size 707948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index eb272cd34e..def31befce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63549e7cf9bf72187d21a8968f6a7cebff888d279b51675ec31b15414dbabeb4 +oid sha256:fdd7543a42485cc53a1f337abb1042c33f8aa84f09db52c796549827a02ec938 size 616993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9f8592895a..b22e1e656a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa165ab8779021df0f101ef278a3b947112132e3d0638efce40fd42cbf0958a5 +oid sha256:9a8be870b93e47221492ac9f5170630371ecd1d8655f74b26e5781b7f228aa79 size 591605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c838c40bbc..8d0fd13bc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92eb1a2cd09e6dfb5c65beffa1244601d9d926f2e99bcf03ca30bc097bf7bdd2 +oid sha256:0b98415dddfc581f0d35166b13ef7ad7e5e26e2577ac207cd432baecf03429db size 518607 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5a71f3064a..a9c6e85d4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37886a95524b6184523171c3dfa82a6172589e7468aaa447c77d794358de48d4 +oid sha256:a7149be6d6e6d2dd7bc07ad0b840aafe2c3d97d55bfc6df950678680b695c605 size 569797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 48502acd06..635c21ef92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b3cc95e12e6b3cf6fb521dc513214595f0174f1c908e7654d8e9d506efd0125 +oid sha256:c1a5c6cf6472ab810e7a4fd3f3edd62a46ac46c9ff4b8d86cfa268ea79d4699b size 499957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dc562c2fbc..31147f18df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe0ca75be4b1d519f20a26a7b28dffeec91cae49a7d6d18bb5fbe7987f1196fd +oid sha256:8425b56058f9820e6d6b0a55cc52149ed7420716b7241e4bfd367f270d75d11f size 564427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c18f3f376a..21fbb26b0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7fe3bf34eacdadb572075abb26208a29a080b720ddb597990555748fffb23ba +oid sha256:7797645c990b3ac64a2cb41f2e3e82d9ed6359aeb041c3eec4f808a25779caa9 size 505687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index adb6a59d34..2eadeb06d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8319df8531261ea23ec21ceedff645585561d55d649d318f4402eb7bbce9a6e +oid sha256:e8b847197753895b5129b668a234c41ce36d488d0894768eb4dcec1ec7c2d3bf size 459425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index eda232e6a2..e0f9bd86d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8652290c74f5fd9381f2804eb8c0b26304868937b7f0d40058aa8b449930d36d +oid sha256:f0c4f2d9e8c72df82bf0fafca295ab3e9272043b2e02550e316cc5c175540d32 size 445263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 79943a4981..82bf86e959 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcdd7ddc27fb6bc1d914d234abb5e77fcf27a2d5cd0312673b8163137fec5a52 +oid sha256:0adb4945626d0ef8a7f710b06b54778473bdfef4ffdd5d905e0ff93f16820fcc size 454189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4803a6c5c0..b3b5f45bc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2d0b4d75925820c393b706e621da92bfdc580f422eb4374fb4f0f9cf5589eff +oid sha256:dd0409aed7fd8e41ac9fb6219f6b4dd774778594e896053c8d339f3cf201ebb4 size 439289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1d38e14a12..963b913f0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b44085c5fdf0c59937f022e022f638b1da93ac1f4114a3e087d7c7f83fc48374 +oid sha256:2172c538da679dde5c7a9ca49547bb5fe4762cd2f100f22014009f443f190dd6 size 563635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0e59d33a38..1cd8dffc88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efc9ab2677d75c0714889f16fa4284dfaa04407f8262556e83174a9d6e6b0184 +oid sha256:ac6f0df9af870e41cc46d9ab1ca6f5d06469da5e9a12bc80810ce1e14a92fc0d size 508103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 376fb235b5..2dd0bf34c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f0c7370a00855244dfaf33eed0836e7a4134ddf150382d4418e1725246e042f +oid sha256:5962dc23e996c56a794c53f27ff734015933bd584d7f669f9c7cbcd33950b7a2 size 444259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 868ade2aea..c99ed088b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fabc4c838e1f7aa5c2e08b056cf7f22e5ce7c3f8280a511781b58f5857ead4f2 +oid sha256:425b70c17a2337ee6359621e818e0fb9a49554d73c408b16d3f44d7eff2852d8 size 387073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0aafe9c97c..37e400d6f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59ed425dfd7eb6edca95dc9b6e20850b49c3264f1b049a88c1959cc73fd381ec +oid sha256:72d90ec6bb9c64793359b7462e9f46c7349b5a341102e3503ad6ea6b17c6bc48 size 427779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e5c66e063e..abe41cbbaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd47c28acbf85534a5aea069358954ae08fe46e03589c64f0b9cc6946edd897b +oid sha256:c0f9f4bf1ad3609d9316b91b087acfdcd5b6db23a9a7531495fe4881e3b8f615 size 369805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ea5662960d..53f2b2006a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee1165d7377e123b407d6fedd8b8da7c1a35dd1cf204389668143c3eaab24878 +oid sha256:8802c9eaa69cae82b0ee62ab242f88e8f532af937ab09445f6ffe0ad61af314b size 476775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5070906ab..9c9b90394c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf56c7d502ce0aee19a63798af8618e8df245690a2c1c0e343508675f2ae2aed +oid sha256:9fbdbbf75f4d6291f75f1f2ec2e0e7846ebbd34b7dabc689f2d7deefc61128d6 size 461873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index efa8e28173..934f2f548e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9a7f298cf484812098fd9b01513c2a4ef9ed687f76b399f399f01cab9ea2a57 +oid sha256:41a0f2b399b347bb778362c8729d153187039d5e2816e1dab466908be741abcd size 476325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 77970868f9..6103cc9caa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51b980d02e7e2b24e1779e5dc270de27da3afc1fad652c23567f304ff0363050 +oid sha256:26c2f0ccfe9b306296da4b7fcbffb99ad285e940ae5a7c17d612df1bbee2185c size 462213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9df6485ac4..98cc12fb0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c0022826ed444489adec4203e9589f6c7a0a505e305f713cf3f1e40c2378569 +oid sha256:0a3e7e8c5fb5f5dca5cfb672d8b90a52da54c4b8917781fb0bc517dcfd9c2f2c size 604813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6eb14d4c16..3e72820d6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0037eba482622680f0de71517b5ef41b5e4da85c00f219166e393f07054bfee6 +oid sha256:00092e4c90ec8dbccaf62726ba6a2a85f122a24f46d797700b62910a13ff52d0 size 546075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f19826f5b7..100e636643 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99786476ef9481b2e05ccf6d365f22a37f49654a4fc28c1f2452cf4de786154f +oid sha256:c1a09cfe5c830f03b0ed5c2fbc3f7b24fd466db4a89cf1e3570b071b984f4a08 size 472611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d256fe7bfe..00eee6231c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a3f6cb0e50891b0c7da8d778094c817fa3ebeeaf1f2a23ccda1a2f414f28554 +oid sha256:bbdc34fe828b861e936b90c2b10c44b70e6cbafe15f5b4aa11a7f20756dae231 size 403561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4b0795591b..b98b511b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b455874426b56e359edfddffdfb3e29b3128ebf13485e134f0ec6aed73732174 +oid sha256:b170bfd6579ebe430fed71f5a41785d869dd9163e4e9cc9dc5d2a97ee7584372 size 452185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2673ab74b7..179c2c4eeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b51567c22bce433e4ad4a889f309aa8111948afc319deb3d83b5564b258712d +oid sha256:2d2519f6a470fa4cdeb6787309ae699d42327bcce7d23f53aaf5e7b0c9d96207 size 386291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7358343d2b..de19be3c74 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab69cc37cc79472b96b965fa06fb1e6383827b5ac8b25ab5e65d18c130c85dad +oid sha256:360daec624d9ad02e8b0b0091c75c9c3c0f5c3d9c29def9c95a48c85a3f7456f size 726842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 2d533716a0..29b3d9e89d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff68005b0facbcb9d1648f9e19e8568dd899c683a565194093e433c8844fa319 +oid sha256:4c43d61d40e2b3db89e889744861874b756e874036172e9baccb4d634a53d457 size 754640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 50ef76d136..a9db3e333d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de8fffbba672abd668ca569d56293cb09ce97f7886e98b2faf558909a5ff69ae +oid sha256:93186567a9ab7c15d1b821eab644217ddf21dc65a9dd49edc20481d9439fa51b size 731180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5c63fa7b35..6726df5005 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c56106c32d4614d1f65843bc7f7210001d2710c27657d3539b37557afc7d7f0 +oid sha256:71cacdad9dd6a2b2a5fb5a125f70c0242a9fb60deff6b5c9e1f52d5ebd9a0f66 size 761248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8d85159a1e..f6fb595347 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:624aba3909031514666a7964ff35d877ef492ea4efa9db02b63eb2c15aa7b698 +oid sha256:d03ffb488547a5e86345a8836a2da74368a23d3acb9eca07c8d0ad20f03a8749 size 795644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 938e8607d4..858dc8af12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:213814f0538cccd5977b85fec3ab8fd03601810fb7edacfaab5f35661b1fce1e +oid sha256:d6d5b2d5498534c33f0314e4ec56a5bc28529b63d825e88f107520c437193a05 size 824380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b52f9df785..e083ef7d38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:304d2b21e84998c306097d064d0646cad223ca2bb685256f20111d8493060830 +oid sha256:9c6c5cc8d7da1ee0fe4da47eb096c901707377bffb27d09310108d4a1733a262 size 826990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index efb31cd00a..ecdc11cc51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b6a2e5396c7a0faadc6d7a91ce6c38b387722304ab788d2f167eb9f749d662d +oid sha256:803a06e1a7ee0965bfad1d90dbc205ed02ef5c68ad72ed94414f18f10a62ec94 size 850348 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6d92dfa9e0..38ae7346d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75ffc3954193a6499205c7191be911c103f6fc0ec9225503f36f93076fa1dd91 +oid sha256:d9ece58964d263f72f79997388e26fcdd282ed38d44cbb97885353da552200b6 size 830146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index ac89138c43..d8ceb881df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c92684e7c99e409990601dda29f9d6ab7ec74e872b0b4d79ddfbce6fcddeb40e +oid sha256:ee74d4188a9e4f3a6aeddf198eec8a2dcb443f6af2e700e677345b5f2310231b size 855822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e10923d9bf..6e6f4e0b08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db6897028ad42b6220069c0ec65c5ff5ed0192f41a31971cc251eeb65d3ab2a5 +oid sha256:9e7347c083d25a6ed93c62d30ce8b76a69bd9497de0baf36c5594f04dec5af5f size 898802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 02a25a8b3f..79153cbae8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e807059a55c3c7a4ce12a8c7c756a63727e2135087becd30569ad2058f97081 +oid sha256:900f36736b38e98eaf4d4d1f35afb8867c860420736b28ad36f67972caba1d54 size 922556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7a3d5858f3..0021a237e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0996f62ee3c0e695001fcce2b28a1a68ded931ae0cf982e3e854a4f36c9d4347 +oid sha256:d073df4fe7974dd7003c9475e443f8089a800cd06ce014933b017f1d4423ef7c size 689644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d9bda3d575 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0177d50cb8215621326bc15c51e60efaf5ad0d068cd2377890fc9a851b65d775 +size 710338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0762d9c0d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:168eb7d611e0be4a84f3bd5c5653d9b9e1c9560f6e9dc5fdc95357be9606a0ae +size 607765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index e921d5e452..b57c76b90c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af4173f213a91f960af6fd69190219f342b597f8058c7af0236d803e5fdf6809 +oid sha256:5104fdaf6b866ec7ae3ed50def539236321afdb99a497c07bcd646044e8931f9 size 684758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f5d1d8c91e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:523c1fc1b3b72133843159f891b6c2d27c6211c785893f1a9f14a4b77e62b3f4 +size 704712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..72a7ec771f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e18611213276154a7d4847c6bb464008f5af805339833de25205e9bdba37208c +size 623008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f4c9ca017f..e817f8e6a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e90a0fc5f5de4f5865ae6b4f41f41ec2e157d71184c845ca21e7ec797744e7cb +oid sha256:dcdc7d20fce6d3f503e00f6686e6d8a63a287beb3226faf7d74984bd1ace8f41 size 755190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e627a46a7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50926d15cb934d5ff342e67001a589f923f70a1d369766b3c77c02d8e2ebace5 +size 776774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9c0dba92bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7527bab59b80d65b653148d668d0681232ace79355bddf36b68bf803fa4ae3c9 +size 678494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9b16226110..d1a49c98f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c47437926f5f84ae47ea6c1a298ef5200d528e8f070ac37028bf41a70ed199bf +oid sha256:a2be68b9c9703e47effe30e5523a1566c40f2124eb94ef051ab82a2a9a06a08d size 797292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..548c35f14a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec09ad6de2c60936bf8a19b2a25611be0a8afeec3b52dd152edef09a8c70a94f +size 816358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c045f1269a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4492d231eb0096f0a7014d0a172c7933c96b869a2b9e5ced554c4c8decc3971c +size 719804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0c4363dc41..4bbc604a9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4cb5b52a98402eadbbd1028f0ff1a2afd172e93901c8638de347d1f42aced9c +oid sha256:4515165a2f0801517651841117f653d5444f43d8d155571d1b4a86f60ff4f30c size 796550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a7c22e8e48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ddda3b0cf160a99320a995d2c5b6eac55e0b5b35f64c04ac9163b316fb09bbd +size 820204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1c3bb1b5b0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:850c6b1562e40b00e7f9df61b7647aab9c2cf12da2eb4956cc7bf7129ebec002 +size 733074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f14ff7ae06..badf3b3492 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d4898806407eb0336cec662e0d4a4a681eaa4659700fb82e1054d7ef8791114 +oid sha256:665d5e7ee4827454e077846a391669050ecb051234a67468d4c72ae1c3686450 size 869004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4a368f505b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fd71a12e3b8409b0d1c4ca5e7c40cfabface2d7b1a896d1a4775501467766d2 +size 889896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..847a8f46be --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b4ca018b8aad4df83364a9a7fe5bc0cd6f81fbf15c15a607c091c88f893d991 +size 787916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d638d1f789..794745b467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6aef438f630e18c957e922484548cd147554303e8b485bdca87f1220c5b9e73 +oid sha256:fd8ac7922ddf809e9c4f70f0d3d9bcdb7f314c5aa7e2ddf6259544751f6f071b size 681896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6f6283bdf5..edd519bf51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6373525bc076fdbc2ed7fe021ebb49949b8ca8522d6cb312c8c1bcf547a61373 +oid sha256:eeed5ac9dbfc70ff35895828d7c504ada08303fa1654aed50e5beb493f90e49a size 714284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 5e31eed8fc..839fd4d072 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:619f2c77e8a21d4542f64a7c44ac11ee3ccd363ecb8bb8649c598d1b82a86e18 +oid sha256:68a41d386c056283c0fda39d6b983252c09f3f50cc60c8181fdc40a4aa9234e6 size 681944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cc193d5a35..dd041cac9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db412f814f7523232aad669a0a69c857211b51401a52f867d6a0714d2e947904 +oid sha256:7e6fd0f6ee10f904b699f61241e32303ab3ee276a1dd8eb131fe9f4b29034e45 size 719560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 10a2135f8a..d0edb28431 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02a922f542f9d4955d1b8f9ad0e5e5a02e8d1b2e0f3de13ca82ba56b99eacdde +oid sha256:193e5dcd501308dd27ca7adc9b2a06ab101f85f2de91543d0da8ccf3cb016c7e size 750700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 06404afddb..22177d9b91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e8fb81f3436285bcb0e56e2bd940509b623f60a6d30b936df60d334baecf5ff +oid sha256:d93777fe834ec7c65058495d9db76ac6fbfc200f67d44dd995511eda13639f2c size 783234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 160bcebda7..6158a1af04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd20e086ab2fb6cb0e62b6dc8229260bd5c4181654e9c1d14e00ee6db164a46f +oid sha256:aa33f45a24fc65171d2f2ec7e53ac818d20471692b688ccb97f3a4234a8a6695 size 767640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 0fdba76538..d6dfe9e088 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00bbb2825623c944028d17131250449b5c31c42620f5746416a36d418bd4072f +oid sha256:05d9b05d12fa90cb3e8ab12fee63fa3039d3e8d7a036e971863d8c02cf6891ea size 798250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 502e706dbc..21c49b0dce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b11f912305330602d29643441733b6febb1724755a3c19383453cd7dd5942aa2 +oid sha256:ea0af5ca2752a28ba14f56375491ce350402b60182b4cdb879b934d52d1c869f size 766404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8c1808adc5..6b8f52ca97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a78853b3d67ca0499551cff14c7a0992c2cc4e000a04a099856f35ddf0b811ce +oid sha256:539b80cc7359cc63ca15e9956966a7173db8d662ed46c7c2fbed836b8bdef5ee size 803478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 133a17a07e..7629eac641 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f0025effc50aeeaa7a6b7c26c373b0a68ff55b87422f1d00e6258c78b1dd01d +oid sha256:33cd7ce4fc0b1a2e62a5760a0dd8c7871e38109ea2f2fbe57a2e64e2184ef5a0 size 838860 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 95e556b6d8..181e676964 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd34862feaa08b832209f7ccea73a82e3f0d004a2dad4c6b74bf5b17a2b6a2bb +oid sha256:569b11fda8e319fef4c262d93e0e547b09250647512f83f80a72752ab0b67790 size 870458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ac38714ccd..4b41edaa5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10af4d8bcd20cbfd779894bbe971b75a0d8d6c03b4c5d974651e4f10528939ea +oid sha256:84438b6eef47b974a8008fca16633ad1ecd3c13fc6d05de73031452bf8f6cbac size 765040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index cfcd103356..92a7d9ef51 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e97c85d55723c24413caaeb6b61ccec18add1bfb6d0b5bbcf14f58737459b1c6 +oid sha256:c128c7dc0794f71784bbbafba49dfed12e69ad074a8f1c3d9f8eeb0800891447 size 673222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3e3dec2a95 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f73d7cc3eff62dbb12cbc2b46429dcccf235e24c185cff9da579370dd9053e27 +size 794416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..25c5f32cc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732964e312365f85f617e22df3f1027d628003f76f43a16c7670d98422e90e07 +size 701464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a81c3e5c69..9c9db0a7c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:829839719877d39021450f06e941bb307b45afc812d7a22af80a09727711a097 +oid sha256:2f69e4548178f64991366516fcf0abd915b90495bde01c01868660e9000470bf size 768490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e6b0566d51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2568d89d6ffb6f4a87f5d39015b8d97c0e1fc70c39f462547cf82735060864d8 +size 801026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..edaafcf085 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d98f442b125edda6406925cd5b462a69538ecdca1cb0c50f509b7e10cc221d7 +size 713698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 104c26969a..6e261f23bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50958e0f4023b57d0019870d3a8df860389336163b5f68ad983c03dbb20df6dd +oid sha256:fb0b21c7a53eb0e2510b760f3757f105ceeb6eb6fa8f8dd13aed4342a9715996 size 871108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..aab4fd0145 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e34fc245ffb057babc31eb75cf0390876749cacdea0886d94975f2bd852c321 +size 899154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..df38acd358 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81fcfe50efd2622d2e035bdef00ed6438b6df447b84e7028f851eea9b4b8f124 +size 804178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 04b0105a62..718ce5eb96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bec22e1c0d73172a8cfd66ac4836db524f1c049350ac77ef6fba41574b032f6 +oid sha256:b3d11b81a335c0ee189ceb198655ab926d404e08f1de4d879b3af3756f229efb size 874954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a770bccb8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f443d3379dc5c97f42e88a2d2f6b38ea52187104b0b25415465c3552fbf480ba +size 905516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..310eef695c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6a3217371db2ee831d297a2ae3acec1ba41325cc7894e2c02302627064f07b6 +size 828152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 740f87188f..4133f96f86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:700e6d18617fb67ff10150aba697beea73d48b00c6082386f00c8caf48521dcd +oid sha256:9ddb2e1cde3e2ca63857bae94c7a629841250363ac1330260f849692e167999b size 765044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 36851713de..e1aba7f82f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:615966d826698bdd32353214c07b3b62f72b3e90d0d4bcd93017306667b3dab1 +oid sha256:f7cdfe16003879a7a808d3701992d8e9d1e783758d1d567fea14af4ae733056a size 674016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..630ec28c60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eefe7a4f3a6c5220dbdc9f736b444d335be45e05bc5d6c9e8ce2e5f99712fb18 +size 794272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e04271db3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a5980944b6a2e56b626a036fd6d6b28d21df551b3a6c0025427bf521923382f +size 702258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d6a14a089c..17963e65f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:431b55d29a959280183046578792bc9cdf62347298c5622113a4568112cb6391 +oid sha256:7bd04f67bf53dd441b54f34e13f9abc83cd067fc2c15b051c9ae70e9511f5140 size 769284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..55af9fd577 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704ee6de6ea6a6e1d88e43591f682c914ba16dca66c98ac4f7701741120ecec6 +size 801818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b50b1ffd85 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e7727be4d7d00801a94939c95cbe096695264f91d1a7c4eed33058d7bb95d93 +size 714490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0b3faec765..327edfacb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:192e9d0d07b4c963a0e2f5698c01d89e71fc1c90fb2a6cce90b9eddfb658ae04 +oid sha256:df6eccc8f9463bf141eefbf9e222136b428e70cd00f2800cf0f4ca362a1e964e size 665266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index cddca4b6a0..20ebbfd0dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82d46c4e62018c47f5e7a7fbc93f8c5a9641e6e4bd3ce279e468c04fdf72c2b0 +oid sha256:3392710e46009d3eb2c25eaa8de23ea4ff996692c3271cb30f8a748af9c349d0 size 692670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 5644974cda..b8ffe1cc16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac3ee461b22b0fe9d7d006db161e223a3576c1d69468347a3b7d80994bf4b1b5 +oid sha256:d688face18545f4fdb6cdbf82d06dd5849ada130e80225e7acd25a59a9f3c2a6 size 671726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b16505596c..d53a410b55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ec6dd68a0c90bac304b800afb46873b33d208d81dbcdd4bb56e205fe6b8c502 +oid sha256:229541cf660937d7053f35ce5cca5eb2abdd194bdb092de36ca14436e31d8385 size 699180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index a5191f6a59..2a6de38169 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53f2e43dc23899d01e54ed15dacc9352a821b613bfc2cdbde6eee690020c8aaa +oid sha256:8460b094a12a1f7eddb34bc5e74d951930924c6d0af586748a1fce82c4fd1e3e size 735202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1f51462a91..d3912ac1ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:684aa14df3a2f456e33f88b7b00386721ed366c04f97c8acc0fbc36f3e662ac0 +oid sha256:bf87fb0121b927ab64a4481214aeaa7eeb2dfd28012b222299f3e78eaf1a09f6 size 761620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3b9fcf8a98..4227bdbafc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a377da69abc34ba157b0802e2e5e2fcd73929ed88fe93dc1b71a0e4a5ee2939f +oid sha256:020db6a3a1b10e756541352edfda3e7be7814aecef1367bedf01f39d6d308cb7 size 750516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index fe0e8bd5f2..94afa35c0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2043572568b90dbb84e9b679a627dc4fc85fdb09c02f3583c0c0cb5b700ff0ff +oid sha256:b563b6a6a5e6363a9f79064f1a75a49941391cdc44165cf6ddcb5942c779ec3f size 777722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index af1556a8b1..820a5d45af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3998b6039c156e684e5e0f1642d051d38a9929465d388e38665fe58865f5afd1 +oid sha256:be99591409b3369f5a2c2b3854576b9551ccc5016a5818549a8f73d276e103f6 size 755052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f2f3a47de0..0f23dfdfbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:289376dd73b97658e472488ca7417eae249218df1224b81377f173baca5d61ab +oid sha256:f087011bbe465b58c5bd83821d03053177e74b7e69d6d21d02123c6107abb5e5 size 782406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a063f2d0ee..9c654f378a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8891ec0d7ffef49975092fd2a78eccdf1c4ee18293f7b5a9e6527efb6dda744 +oid sha256:2e6321c850786df4e2195adf8337aa69b48507b423e1c6803653b7159e951627 size 822476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d9fb5449ce..eae2b02492 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e01ae95795e3b8413eb1bc12199c37d1e47c200a46949d0fad519d20684021f5 +oid sha256:a10b7150ce65a810e9000cebdca67c80ba6ad9c8dea1550a1c042a6fec4b4c33 size 849928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 30e521c32d..5ee209c178 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:421a136cf4fd2f4a962d1fe7327654694e91dd335d1374894fc065a845ac8a4d +oid sha256:e31e108a43d2ec6d14bf768f2213e9349c0bb072954e11e20b3a8a84520061e3 size 664822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d688f0fbfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e6fd4f868c601b7cfd788a00ceb07c01dfca0a2da40efb03e1d62646e7f8ab +size 681964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..243cd0e3f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:744d0b58ba03cdc9c8e12aa0cde016de6f2b84b637bb118b9e77264b2e16b82c +size 592021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 9bdd73da6c..39bd6970cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d199ec0a0d35553f4a85ae52288a1b09982aab39c25ee166179e066e117547c0 +oid sha256:d894ef4afe5f743de90839f9b6e54a17b6c99fc630c120d9b3cb2f316a925e8e size 663092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bada6191b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6c9165f8d66343a04511501b2835097984cd1caa95f9bd150dc7b4655f67ce7 +size 681124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f6edd06da3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f65c1ddc15a26af7d5059df3f68aba0be2a0b8b3e14ef2516241cfbf83170183 +size 595471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c680ea78cc..afacf4537b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92c3843e1e5ee8349e925b3249349ab2eac02b0440569d2136b501a19ae3dae8 +oid sha256:d5ab983bd8cb0404de824265822d904c2c3c1416ecad3d4f1bbf2c2dbdeba5a7 size 732982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9802576c69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db690c87df7690f07d604f7d741ca4cfc89e68c697498a0d897bcfc94ca32d0a +size 750224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5d9d15ef40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62fe946bb532b2bfabb0e1e38d7c7156de4a69387f3f5e4e253fd3ccb1b326a +size 662846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b2a68815ff..fbcee6c95b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50a7bcec90044f34e4e159498b9bbaf8ba4105076673f416775bd65e326fd99d +oid sha256:05240d0cf4d34df3fcacf3ba75e3b8fb19a2215b59e0fa1eb2a912fd50677ed1 size 753278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1313dc9c19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76984b16f9034ecdce9042172df6c1705f71aa787e862d1c4178979adafcb5d0 +size 770618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b535e73e56 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3daae3f6403c2246fe34aa0458648f054f957ea9ee707e459777c1150a92a1e +size 681416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5a6ffa0b9a..4d5f35c5ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a536d58280ce6490e55aaedc30af36f989e932b95a7337d754ec6a20fd8f4702 +oid sha256:148bca9b77e1d94e0a1a6bb03779d1d9c03de0f295c78c5001b1f9ee22c8443c size 753474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2ff2fb4b3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11b8ff998fba20fdb034c11b906a01568cfaf39189faf8c4e47d4bb18b9d72b8 +size 775402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..bb8b01e4a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3484eead6d48f65d857df3691b318a20d86d60bcab55904134c34f86b8c1b092 +size 685656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9215b6081f..f2ed0979a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45f18b9a9a30e98c07c16fb9a3d7fa83715a8b427d573d1794c56d58afb26bdd +oid sha256:a08e738401a4751c87f58f078ae3fe75c9a342eb4d9016e3d2ea8b9daad44a8f size 824942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..61408941e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f91fa0af7531a0f497850c7241f99d5f8e6c86565e12493a5397f08226c1489 +size 839470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ed06ac3fbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a72642ec1d28fd1a145f91148a02d166cd90703f99e8d2d05cb271ed88f35705 +size 752586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 2dd76679bb..fb72a8f8ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf1cd2374049ff0a4637aee55a0f36d9f2afe1c0a5bfce57241192c7024414ed +oid sha256:09a15cab79cbf7be26bc65452b8b17cef00864f3f4bcedd56f3231675755d545 size 653670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8d5b75ca1a..307ab47a68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75218cca2b3107716a0524f99048b0a6614aef5ae25bc621245c8c137b002b33 +oid sha256:3eb7cef01533c78f494c1551f601392151eb00e3ada1396a9e8ee9c830a672ff size 680334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 7d98036cbd..c907dadef4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ea30834016cb19acf616092b0ffb6b8110bcfdda138322da59add77d39b6be3 +oid sha256:9d6ebbc2a15675ad4214d9ab2d8be82acc6923f3c5199992d95e4aecdad4c9da size 658602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 95284c74f1..a964ae2fc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7a21781d3951183e7ce9f84d5f1283fe6cdab1c6bb2fcc5be6dcd68e4771d1c +oid sha256:366ed851fc30a8110fdb66edbf22a2bf3e1d18076601f59ad3d7f653db4f0ab5 size 686054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8996904b6a..749174b563 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b1e309f1df330c35ae1e4c8826f41ca6279c73cc660dfd349ed133622beb3f6 +oid sha256:8162c77e07aae4ea747f3a5bfef6985a9f8ffbe5e0788b7c3af0dbf75e28eb97 size 723656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 554162da6a..99d151c07f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b24ef3767fc060458a3bb5796088bd00c9ee330b86a0d87dc9691dd9a2bafbbd +oid sha256:f8fe09cc3426a4550c949715556ce284b68f317e68358e747f5bb8d720976b34 size 749284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0e5f9a3a0f..922feac291 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65f8a1d9d2b99b55046a67ea903027df9725182bf7b29823a507de9805357611 +oid sha256:ec6f9a9d4bedbb571eacef14bc96431bb57b51a55d4eecd35c2d29ee36d6775a size 738970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 5ef33a1074..f78aaeb92a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4080cb757169babcd5c928d364ffe47f823e39cc21ca6f4eb4318e3dd1783d38 +oid sha256:4f3634b6e1c4581c0f1f3e7bdbf1f9d425a3d63c17c8d48e9cc4398114c30907 size 765386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a7988eaaec..db6f9626df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e1e5b645067e66710e37c9831fb5030945e6de8d50966a2198f9ca20794edde +oid sha256:fb0b4c1051029ea89b87f00e4565611d9196ce16f8b5eff0d5a0d5addcf10466 size 741928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8588bb7c1a..e134438a97 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a9b437c981c984681950ac20e36d1852538058215890b2a93c94a35edcf4960 +oid sha256:c918f1c3c7cb4b1247f8b232c505a3214dcb64c79674747877a573d6c5c5c131 size 769232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0a80af2fe6..5f39ba1552 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adf8a1688d1786e3e795ea4a13f8008bea8d7a84de8fee458f6961c3e6d5b478 +oid sha256:90327bfe65945da63ac346f8b7ce030525e06c855fefa3609123cc616e7ef6d7 size 810140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a7d88d25e7..25f20492cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f99631df937257870e7bfc39ea2ca4130ec8c410848699d4a75f2ac3af297466 +oid sha256:3170523b9758a9b7d6841d2fad4d1ea97ad13f86b760afae2a2e5088945a885b size 836804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 44e4d5de3c..6ab5c1f485 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cfde7eda8b705344920a8f894c4794ba86c77d742f54a7a333c92d658bbc273 +oid sha256:d5a93112b20138465fbd298334167340582a8b3fc8e08a91a833694be1688917 size 694780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..943bde9556 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cc612ee5b8260353ba849fe2e34c0775f963f729cbaa6999c7c28cdf186c62a +size 718928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cb5cd56f89 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170bc0edb5fa1cf1623f6b6448b83298e99627028734c0f0361972bd246a9389 +size 632834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 2e95d30443..933dbba60d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdf0278ec186ad9e96fd12556dbf583d9b2d744ec8244f877c946fcf5e710009 +oid sha256:83e6ae9fe46661b5ac5eafaecf115ef165e97ff60c705bc008c497feabf1cfa4 size 702574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..02397eec81 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93daf8d550389e226f0f1f31da87609b73b4be61b67c6093b9ecb73e43c206e +size 726326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3b94ad8fb4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bff472a6fb6fd216b6ab0e40754d8c9bcebb2de74e77a86dd2c67196b0fffcde +size 643390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c160b07fbc..dc32e2f8bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:76668dd78ae41736f91abb36d93d79e11d076bbfd9809db8506834bfff76714b +oid sha256:522811f73031440bd4b5461483d1e4545d22ddd74c6408b3c81e45e3a35864e5 size 783780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..11407b3724 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f07cc5dd4233fc26827eb78135a8d655744158984a1e0db3173d8ff043551be5 +size 808470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..745ebca73a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:038e49c57101476ce55edeed1a390d314970ea792a1d843a4b46811d710d2a12 +size 716998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4b77a84ab0..e5db812660 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33344a1116de76912dea86a4ac90802bc4b26336e23f624fff7e0ef0c4bc347d +oid sha256:23a6ae13184e2a06152922db8245502c2729fca28803634e6d713e1d5f964b5c size 791672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..36239ebaec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:081df7b61e0f043c39aa8436f5a886b1060ebe71c482142c466d9b299e6fab5f +size 815228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..52d042dea9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a7a90aa10d7f1cbbe464668e243e78620a4a43f7711018ec6779d0cd6d97f1e +size 735200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c16bcfc4da..7b55b2322a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d68efb9c5638d6acb64f99197ca1bb805b810fa32a8e67199a8a1520b9a30353 +oid sha256:1a7119051359a57132b002afe306fbd9e61ccc00e0b3d32edd17e33e838b8492 size 694784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..765998a923 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19247f6b1b2317165c333e3d73a3649405e5a0705578bd51f622c761f1ff12a4 +size 719624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..04e035d5f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0705b57820dfb5c6375450946eb644d9e80a57e4d5929581f690d840883fdae4 +size 633628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 38623ab5b1..05876e8650 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c717d7a2f1287cef8820e7f9a1e883b630e7837fc26a2179350993c3369d4bc2 +oid sha256:a5d244771201af33b295c8fc705fcdc530631d36008236fa2eac94b15c6709ef size 703366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d47eedb4ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acac50da9263a3992bda6fdbf2dceffe5d017d6b8cd72134fd4dea16c2868fb0 +size 727120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d8b7ee340b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4acb39e4c47f22e2647d947853b92b61b3e65360e8be4e42d5c49cd685d5bf2e +size 644182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 622aa95fa9..83d544ca9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63d9636cfcbc1d282ebabf238d12da0b1fe4150097197c41e6c188887761e8a3 +oid sha256:26ddc4de7e074c3357730756dd483bc4d3338198f3fb8681d70b96d21e136e95 size 683954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 691b83d788..596194d275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a439f83b39dd3a2b07c975c7531442188cb76de2813d6fcaae589348495c8cb +oid sha256:dd36b11751c3ab4415021ae2a9d1fae950c32578df8161032284fe819f03311a size 710570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 23ff4462c9..ad161cc6dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa09f780c3966a2fcf995c2b201f60c40b1c67fb38e479ee9ef0a123b937d410 +oid sha256:d59ef93324122fa8cdabe95ed836214ecd107c22ba09fbc30ed3712575752878 size 690662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f01e4d070c..a8d74322dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:253a0fa2df0b6348183a3fb1456ececacff12e29429370a69c37b5c778ba6b98 +oid sha256:c3c6d6d77a5ce68dd983c44d94c35ea09bdc24aab92c1d2286cc3d6b31f5687b size 717326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 998439366d..8c8fbeb466 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2593e484c2be4117a3e5d42b36e3597e42a67d717c4e7ae57dce32f69bba62ff +oid sha256:e0361244fa1d1529b755447017660c116d375febaa2dbbd8ecb4fea1ad3c56c4 size 753892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index b2fa8b0f7b..f43e8a2a52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61668d4d03a511c8dfe1b1744dd7831e86543963ca73c5f53f0115e7ff95f501 +oid sha256:b11f2f99e1e442c6edcb20e940a4cda4afc1cebadf82937bb1dc522f4fcb5599 size 780310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c3bbd0f3d8..beeb8e820c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ad0597899ce120bc3f40765d7ced668bbb89ac5774189c2834a432a4c8fe781 +oid sha256:c40b114cfc72fc06e97217fd416771b37257aae7b587307cadea2cce0610b178 size 769204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 93156b20dd..6bcac4f125 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:889309f9c504886947948b538a0512229ba0d1f75d9a52a7ab69ffa691169d4a +oid sha256:cc7cc09599e577d67fd23c81165f4d958af0e6cabeccbdf4d6fea541e7c97704 size 795622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5e3f9dccd1..de4f88fbd9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a1d6c3a83c1933b37a9536ad8bdb6ec817c067be0989e3e2dd22fc0991fb2d7 +oid sha256:be7a1ef57512f54353f1639e668461cf011a5f927a37e38c91b1ab04c8223876 size 773988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index a19ea01c87..be4f12ae20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee44b09e66839dd3c29ef8f9ef696418995752ec260bd63526440663e90879d1 +oid sha256:ccf5790af5ebc5f4455f9b0dffef8817cf057a08e3a25764c447ff27de6e2605 size 801342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3d25ac2a3b..b30afa46bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebc6025111200033e9f51b0e6582eb3b59b1e3699c5eea52c917dacc46963194 +oid sha256:b000b38ea88a3be7c875aa9b7399f2b89046321968481cbbfd90402f71c6688e size 841164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 587ddf0211..48c13aa5ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffde8e5c85a45ce741abfdd9cb354fbb0c23179c5bc9d8b192b7e5d352d05ab4 +oid sha256:5c41e2c7956c48699602a9f2a6db04c616d0c67e40f42783350d7936bb05774d size 867828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 53f5881812..212b1ef021 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4beeb81f4a5e4d15c0020f17035dba130d256a5183ef4cf76cfd56128d72343 +oid sha256:8e63d74d189fcf857f2e4c8427fd3f8f66609597c3f8591b97182000525d5adf size 672114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..aaf4c19afb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8691fed45c15edd3762735d819df94f4f721cc2e935eebbbbbc1f5780d71224b +size 689258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5b5a0f2b97 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c932b3362b8ff26cb91795d5cd8ac0d0d224ee4d95935122053672bdbecb1391 +size 600005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 5dbb34610a..d732478365 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82c9f78f48c360d4fccc72c5ccf29d8a5fb620382f62a699fe8f1c17565ec966 +oid sha256:f3be0239e938017c783751286e0ac376c936d947f58c00b0df21e5594f8125df size 671176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4a5d1da298 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30bc3812258c53b37e9c512220da974957441c24b3658073425c91a8e0d9d2d3 +size 689206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c740c0a8d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:564e0dd94cd89b01d443046bf3d411e89651ad3c028a594bd4e74e4bb6543144 +size 602765 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c5b685b545..c74cb0c683 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47bcc2b881ffe7cb7b6480cb49cc54d713b57d4893b3be220f8def55b9e2bace +oid sha256:6bb9e69e16cf38293bba455aa19bc4fba818c39dc1331352faed48b239d51f37 size 738056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..22052b0f2e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:438847cbd81153b2811cdd290d9869012fc4c396cc5da155a7bc36f69a2979c7 +size 755890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3035d166ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4a32858dc963fb6ea2be428501a838ad5f357ce141900771cf0f5c912965ec +size 670930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f772a46d10..7882b251be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6902740a6f6a69320234ae8b84b465b45b32ccec4e8ac190c0ea3df3205faaf7 +oid sha256:4f55a3ce6491892068cc6e2eb34e571e2ec3db1b6dc3a0db282b3e1cae5fbf96 size 761360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dd667a3b4a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e8a55b1ddc762fdef033c44e135dcff99b8a7711aa065bead9fff69acc5ec9 +size 777912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e7973650ac --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ceaa1dec067a24d98b199122d4cc319a76ad06087b743e03351ea590d19a11 +size 688708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 01b5053cbe..ad78b51bd0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bbe62ac8c86af7777fc0d9cd55007c5b83eff1adf7ee88610ca63e372c305534 +oid sha256:66a70c390fad8b9dadbf65dc6ac66bd30e13b2f4815bfa994a4404f1e185d1d9 size 760766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fdbf29cadf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:401b78794b1755ae87465b9ead2974d41dad4081f6c0f5905decf7e169be4a2b +size 782646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2b1e47a82a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:974414c1fc3dd8b9af6d9bb68fdcabf489829cf1f03d7cc8602c47eb62f42b2b +size 693738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ce969358a8..62c01e60e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:519577764ad9bf7e08b75c904a102f52ef11a134b660e762b9167c82f85b2777 +oid sha256:dafc9295882852729ee66e0297829a257e4ea4b96a34ba18ee11e3ac53a3c887 size 830212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3572fd7530 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b38705fc54d41e3135c15062dcbe3bad4448d2f14a9cef56f0e7876775a1ae5 +size 847552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6a4c1c2ee1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5ddf7e849a88ed7d1fc7c256ad6832b1441d679e936bd3bc004887468757d03 +size 760668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f176afbfbf..e0a2487686 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:401a8f3d01d8981bdd3076daac0dd914df953a6057a63c6861214500cf206180 +oid sha256:c8fd5f3eae3f35bd407316d26e0e776f45913c4c3f5bbd619184c89f1ce1be74 size 705660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7c9fa032f9..c8d033122d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c569f8bd012bdc2c9eaa397c01994720149b0a7a40642574e4ef3e88a6798def +oid sha256:1182fcb0a169c181de5ef2e3e6a253fe786d0c849dac8225d1db9fd861685605 size 730596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0dab1d62b4..c9fd0840d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3f43889b6be5ec2ecccdf2a5cbbf000156d05f74c12276fc8c1d506874709ff +oid sha256:67ed4828cb3df8fc70a98ae3606913b7d5c601ba505e6022552837539d5f9596 size 711430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e1c0dc20a4..4c5df1f2ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70ef44c7e8c667ea075f63e1d8cdf8d43c4d9fc875a192dcc20641c0b96feb6d +oid sha256:a84aa6840a69645a5ce646970f2eb2cf3d048d887c59f26222d74124f68ec116 size 738142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index efdae6a92e..2c5c429553 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de143e79cb494c19c9b755882a58d4ff27e640b25edf77336e70a7dd039fe151 +oid sha256:ea51335dc342e7148b236cf774d42ead6336f140216efcf24423685fa34e35d6 size 774808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e0560c85ea..7837f54bb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e8d6aebd912e413a74fe3372e104eb9d46d1ac5c261b32123bca65fb7b32f85 +oid sha256:07ba200530e0d59890af96553e823f53dc6d800bf4f16d79833bc78085486623 size 800336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ce0fbba9c0..039d82d8e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:078707cd397aa12f64af520cd9fa63a9de5e7e6239b240ef1d2ef8c27bb8132f +oid sha256:882dd70563e83500fbfbb2d5f1457604e6951e699e35585c0ee95dd9bfe3a9d0 size 789972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e1b109509e..d4fa6a5913 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfac6cc616b82419c1b75d667e0df1b9a5819b1b85d4b9dcce463ec8f46926b7 +oid sha256:405fefa77bec54d5610b8d7c086b0cb92f9dc9192db0003781b2579ba7ba87dc size 816340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d16bf43666..d75cd5688e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7484d34851dda3f1ff8e927590904989ea08f1cd281d42ce2595ea9a9527d316 +oid sha256:c5b007516a7f4c9a5bd783ce132f20e6ca159c87a0c55d8eb4937d6fd7803a55 size 795150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 74fb57399e..f9a2728591 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37815d31adb21850686b6605f6902601ce184b7c9f10c955473a636a3fe9178e +oid sha256:40dc3b0854c89dc9c373b59a268fd470cf3a65549a3c8ca92fb2f716e7fa55ec size 821074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e6109c1078..ac051e014e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ed3ffcd6b5009cae574068fd35820a2ef8a7621748dac463fc543ddbd00d69a +oid sha256:15a059289b08bba0460a52130a8e093619f5cf2384c125c6fc34cf6723a2aa8d size 861142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 696efb4bef..f189155782 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b16ede03bea9c955695b4425e0e2a379ce82ef2fb905f70fa30b8b8a5d19df2 +oid sha256:36fd4474262fa1dd3f552a25b2711f1362e3c0fa47a98c3a74af79861bdead82 size 887806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 50ba6c6e5c..4e27de0b81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5419d85e23a7e62660ef24ac6d331811c98d5ea730037a80b6f76f392d395051 +oid sha256:7e2fc2435b24b232dd9d66b87687e1201599c5498b36bf9142418a5c992eae98 size 697768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c8d5f84fe1..10347238ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bdb6db45e2500dc9a2bc7770d72e8d7a334ddbc6e49ce5880139182a966ecf4 +oid sha256:1aec77b927153cd82eda036596eeda959772f22215a9cc62491b155a2d5e6570 size 724432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0d2df71c55..3809a3ec47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0fd3e6d3140129f3b1e6ea91d77e0a6aab5c4e2cb614362dbac6d8da9245eac +oid sha256:aeafdaaa2f9b587308170b37e42b09f9dc0d655d43c95df43fa12dd6ba00adf8 size 704278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 1669347b43..39b279ebb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b39ed16eb4b4da2f57d85a1c3baf8b86fa89b60eba4fe01be6660af771fe088 +oid sha256:0747061775005fa71e4951a8af942be2f98f80df9ca4a872f3c1814f295ed0a8 size 730942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9d1a0cbdf4..6da1e42da1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:845a289263d94dc4f64de488f6338882545de7106331f053b3e224694dbbcd07 +oid sha256:4d97937b3500f85efc8e77ffc9ba8ab59369e4b41dc4b2b8c12566b38b70d9a4 size 767754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 8e9351a999..a09f2d0b17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a233c206607ab9e2ea1ab8fc6d45ca0de4c82eb95f8e21cd7599cd2ad8f248a5 +oid sha256:740c13cbda6cdba048edc5111f5b8bacd9761fdfeefb72a9449ad36a30997f02 size 794172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 977d9f886c..b1362376d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc48a824edef1d587463a0a9fe493fc89d006cb9fd42e52ff33da0e5ab4eefa7 +oid sha256:1b207361f9984d5688da24ed6146f431d4688ebde55375271d6cf2da88800394 size 783068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9193b7c93e..bffc98b4a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baf7c0d3265392d7af4aa0dcf02e37256f018ad199d725c1649832e7392a0a92 +oid sha256:7bc3ff7222fe25c9219ef40e8675c44dd7308bf3ad2fc5b8156bdf63bc850774 size 809484 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b4271e316d..72c601c553 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:551288377b9568b4bb31a6fc6cc37e3bcc2a144f6edd49e14df7ac3bce3f2364 +oid sha256:17b0124e0c9468fc7ff55182424c402e0a3dba8b0bdc941485b6b44afd76af0b size 787604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index dbc188616e..c661cd4934 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ee6b5f5c048e14d89220f875d87d2f2e170c1a654c7a41c6da2dbd889746af6 +oid sha256:0e9d9010da52ce2f69d3be761b5f732691e23b2d3dffdad72f3f723cc15bb492 size 814910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6fa50e4c51..eb5aa518fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63aa4df3956e2a9e2d6fdf0466d2e5e4b2c7aa055f64647e4d7916e2d8399da2 +oid sha256:9474ad20fa7e89c95d8f106d927fd22173df4e2defe0815401f036051aacbea5 size 855028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e9574b3dac..c2140f757f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5eb44e5157ebdca6ca135bdae222056a8cd38008ccdfb7f6bda1f46b6a0c50be +oid sha256:2c5cb0fff0c29c24f1f1fcf5237ee3583d894dd9d19a8c1df170f44677da1bad size 881692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1f2c7d4f69..87c4ec98d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cde59ae428255d38ec0a5e816c2a44984aa95a3c07428198b6a1968958eed317 +oid sha256:318a50b94b68fe84c3c9c529047e9d677bcf9c5e01fca2443b6e6b7aa748d01e size 698360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2c4e940c83 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16850eab77ea7eafd6671be1add84c99cc06f2e5318b9e0dd5ec6fc901268c55 +size 715502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d358387442 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:477dbd9b0dc8b7ba2ca246159a04ecfb4efdd3f5191e5deba50ab800694f497c +size 623242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d31067b46c..376ae0494e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a2af676be9ccb6f16396e4b6c794a8a5ce9ec85a173b6b10a60b618d837fc45 +oid sha256:bd35b4dff7f182ddb13b838df2876e167bc53c1026658bbf1a9552835446fa2f size 696680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..56d58371f0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e397f79c68de51a1a800d58ca447b5f75833d8bdf168f06120307d84a0529f2 +size 714712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c74d9b302d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8f226068af3057a7e4c0d6a07fd6c5e02af001e65f6563edccb8a44fac3a0ea +size 625904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 659437a500..8c8ba0b2d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:857d64b9eeafaebc2977bef2ae13d9964558b4b6f34c406fc1ba4b8ca9f004cc +oid sha256:9d9d5d8791084302cca013af991153bdc5daa5a35005a567f387277ecef1de3b size 763562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..05be9c8c28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8332fa1673b57671ff621187b381f454746095c0c64c3393a2a0fa8baa52584a +size 783762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c569dc0093 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfc7b5c6432eff9d0d1c9b4a3e2eded11edea251c53d84897945353766201d7 +size 693228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e7100cb5df..c641f05ec4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65591bc33a041a5fdf770a9a52ae85444883529ff35c0af1d035191b57bddcb0 +oid sha256:607980cbbdaa19f40b6931dcc6f8563602276a0c78369007afbe54b98813bdf0 size 786816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..70387807bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70eb053d2d8d56c1d7d4cc835418565a97719d21d0aa562e87fdceb9ca180578 +size 804156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a5f386c072 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3187a560e4f2b698dc10588cec6fa71389be3f993925b6619c311d0804621bb7 +size 711796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 347b96df81..8b8c0cd12e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:737267c105582e28ac91fcaa89784806a28f158cb74b3f69657d795fd2b6376d +oid sha256:dd8838e8a7439d15d3f924f05a2083c317b43e1d9248b1c8ab42d0f5c1e22383 size 787012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..698e88d3dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecd51f3e8e498833a05f627739fd38edcf5b04916f276a374231cb3504af54aa +size 808150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..98be216578 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:186185f3bda1bc51fa03209b9edae3d04179dd52e706e11d7ecd65caf0ef8949 +size 715988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2b40a42a2b..2af8f59ba8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8636653250e5a1a0341d32f59b6cdb2a8ab60f3b826bb549edadb2d347746438 +oid sha256:48ea42e82fea232be3bcb1c86e58f13f3766e42aed2d548f4c582b05c771043f size 856458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4bd8e29585 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816be327eed158378a31c07ea082b81955ef8e8740b61b281e495e6954ae3b0e +size 873008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b4bb4057d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18177b493b23e7e983a305d963dce6d75855596f4b7ba2233c9f74ced7a54217 +size 783708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 4f991893b1..03d79932f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35f039c22bf4ff50b601ca92d5def5d80758b588be4adb8ace3460ece64325b6 +oid sha256:35cc6459dc65dafef86ac66b3ed4cc0c65b4871210744bb1079044a21fa8a937 size 650308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index ee809c7383..3d3d07c854 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ce7c1ac342bbc0c3640f3173f364ec93f32fea559d7bb17050381772ef3a3de +oid sha256:c4c75d9910526139a994743211dedb16b9e6d01e5c08f4baaecacfa9890235d4 size 676132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index d55e3a6375..4594a4768a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d78b7b4d05e3fb3eab6def9b125b194b2a97392a8eaec12ee516da533c4a076d +oid sha256:9be18bf52dfd74845d3addfb638a352341c3ae9fd0d5b35c63e9d405c84b5da7 size 655238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index e708a85c32..0a8a5e02d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a991a1e2c6c823abf1ba7c1328649d868c5701d7072163233ad452de1775e79 +oid sha256:7ec76412344d9a48ee096a2afed2753e73fd40de66749a2e434c9da3deccd0ce size 682642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d1951d3817..2f72cf7526 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:644bf9d5b0f97ac0a3c081412e7f6a88c30eda4796df66663ad9b293b09f369a +oid sha256:cb82e1011b1dbf4d0efbac907e4acec0bd3f5ed0253525e04c6802df8acb708f size 719456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9b7b5efacb..99e69ca14c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d58a728b47c85cd371c85eb24856dbb222112f0e2ab5680278c7f282de3781e +oid sha256:aa13d918d3a60879466f5e0f1eaada00b4667f2ec0ab6d20962fcebec7efd20b size 745922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d6c88bc508..8e2346d0ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed34ddf260a471cf93819f32cf6f1ca7cd67a9c74ecea7c396887d244524b8d6 +oid sha256:6a6a4ad8ac5d275b6550653ae0196a016bd5e72afc7b3e22f0d79aaf4f32def3 size 734818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 85b75f9138..54e84eaf15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f7c34f7923c4b30a379a920a13fc1d893f3feca5a00eb9e82792d292eea913a +oid sha256:d209e20c3ee7f9922fc5a89cb98dae4c0514e67886d29d0ba78e44fc710e4407 size 761974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0196122ba1..76c08927a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:653db49d5161103c9ae5339c1c49e58be5e329d3762b26c0830b25844102c6b9 +oid sha256:363b702974f30c3034370d0bc0198f083190f6358cd1f26136b1f90822d89e31 size 738564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 18b4bf6b3d..385bd5d830 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:437a1e8910f8a020ced9a075757b1f82118335397e09459ef88169c0e433dcb4 +oid sha256:bd5ecd06a5d90e59c91f118087f7aaed4d2d056f68b399dad0a77431ec62c65e size 765870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d47eb9621d..5a14dc6073 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b854a4ae0de05514e07b7c90df9ce35222a2e445d61cd936836e51a66bc2b85 +oid sha256:1ddc2537e3209bc762e3d39a68e5708fa38df9f08ec38bd2ec16fecb52a87ecb size 805988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 371d751f99..b8c4215ab6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1a8ec9fad9d476dd8a35632bf23a5b27c4d922c89e9d1dbff9b950bfeff0ffd +oid sha256:9280aeee19a0c73b4aee8c8cc749ff014f6d7a59f5684cd06d98dace6fdec93c size 833442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5d1b36907a..46c74cc3c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44696a7df086d9ddc89a6b9802eec309ab8e746f48df10a925c0dda5d682a80f +oid sha256:bade0944753a0a71a5d081e795fe9e477d01f6b7c0cd4c63a81a36373e47a28b size 666836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index f51fecf101..f324b6179c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7315e7756fd8c669c9e95f2a1886fee82e4c5842d053ac0b6ac7a0b25342a530 +oid sha256:c99508127ff24e44fb8c7b25b6be2a93ce9dd7f9ea55b53f8c2e18eb019c0965 size 694240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 44475f241f..58a8e85fe3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6edfbf6e839983f60966efc242450333d3340698d59c949fb3dc3b165e001aeb +oid sha256:d8d321c40233f35e6f8bc4b878d9f53f47b83efd4849022e44f88b12676196be size 673346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d3be3ba507..6b555fda42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:341defe5396779eb6cc1b0426deefb7ce8da90e0317e90376f0f5a45521f2cbd +oid sha256:bd4e7b823ca1a800139d0682eb4b88afc853ec93c22f1cb1c4e76925edb2aac0 size 699960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 666439acf8..4c02b3beec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1cd48cef44ef0250aefca8c7fa397cc0685ec913dc2770f910083a9e2886e8b +oid sha256:847d2dbaa3624ed1cf3f8b372d51732381efa0f5ea0b25ac3e1d2880a9611cb7 size 736774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9e5d69159d..7ab60eb9ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:663efb6485902bb4f736785eee90e8ac945eb58bf93c4fb6cf6ef24ac48618ff +oid sha256:b36fd95fd390d1874e6c42bd05a977df7c637712bd301a58b5dee19dbe9a9237 size 763240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 336f87b139..cdd5500fd5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bb4383a17ee072c08387fe16f40e0ab6a14771c8b26ba97aae76c1bcc3f5713 +oid sha256:86b84c5bd6d0c3512b2cdd92fa4395a03e9174a50d7fbbd90408f71f719e72d4 size 752136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 08e8598d97..00fe0842f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7fabc00257c0da34c04907239d9f39583f19c842696d155a823a2042a3ab4ba +oid sha256:a214d48d824be216900aaf58e3fda4567745285bf9d3dbf9d249ce738c2a5dd4 size 779292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dc94c8b94c..8822caf3c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e54781e29ae29563f84737416cb78390f9d88fe8c5987e595f26d087ee14c4fb +oid sha256:51528a61292d15cfa2011b200a77585500d9d22d977765160ac0a333b400879f size 756672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index aeb81ad722..9d8fec1732 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8544043bcf041326c27d4662ef175d49acc1d411e440f769ef01b2e579e842e8 +oid sha256:fc5b46aa78fca9be64fa54ac7f6f073c0d5ca1ff531f48c74b1fee9197462ebd size 783978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b8da9cd05d..6bbdb8c545 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eceef296dfe0542839d2987d6ce5f7762ef366e11b17cde785c5d6c956005f1c +oid sha256:cfe14ccbd15e623d12f5b3d81739cbbe87b0bbd715546b5bdb59bfa75221fe62 size 824096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 4113b79532..c3529c70f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1266b5053dfb04ec1a35141bdbf30c49bf726d7170a10b57fe49084be264d59b +oid sha256:fc3c6836ef2ecd965a164cc60dcec9ec9a68757a4b44cb139e752e9cf29c5a28 size 851548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c69453c755..a0176208fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bfbc022e62b717bd86fbbdb9dc7846b8ad9e6e40ed3f966cb158d0706b07759 +oid sha256:b2ed5a3988f1f2c04d78698373de8cfa754a21172464a705f34801ed77b995e2 size 666392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a799e6523c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9222bcdf3f9d80318edcd7f45bf970cb05c2b456ffa1af1c97a6b911ed8aeb98 +size 683584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5bd6d89fc4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c757fdef3df4079952ecfacf1c9407211db136bfc466cef7115dd3802672b0e +size 593641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index de86d918de..7732312c70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ef4b2af06077100bbe1e468ad4764297ee13972470d92d4fac860cf18046d39 +oid sha256:9b7a55ca4637f6705c36de623475f74718d9d8b84d22625d0a4f17a1010345b1 size 664712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..931a1bc714 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac9bf7db044a57e84c81216c5c8fe73c1cab1f3e354aa97df160c0850766eb27 +size 682744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..55c4a0d83d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1db2f7d589469f2dd997e8b05ca288f46ba22d5a1924f55b540c6880444a197 +size 596303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bcfee79cd4..80046404d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ad0375c2254bb9e8f254e94ab4bbc9591d482975b62a00d8bab3d9e9d5d5882 +oid sha256:e58393443a456c5010589b692762f5836774849f2cc24fb876d664e4c4fdc702 size 733764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e411388315 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:011f5bcf4963986b7a32a78bce79eae2d57b68df88ff15159139e7edfc7d36f5 +size 751794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e2d7f2466e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:417f5846825d4251123a07a5e904fcce0bd18b0c50ef4cefcf1ddb58de233757 +size 664418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f1f8222115..bc86cc8836 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d38190997c692a7c11465539c08ca782c332b69e73a5b01d6fd29adcdc954b7 +oid sha256:3e7ca26c424cee8d063fb76eded89b78a7bf66c01799fde2de2eff698cb8c829 size 754898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3f96ec00fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17667afbc7714c0dca72bedeef2128ae911e33e573cbd757bd168fd95ebb58f8 +size 772238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b26ef9bfd3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873ec01f6da3f162c754af453e1eda765a83b124b43bfc9043e9ebee13dfca38 +size 682986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4ed380fbf2..dc6fb0f4ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b19ec11b6dd0fef3f87ebdf394af9addf1ecb169e651336937631956bb001a2 +oid sha256:f73c8a1ff5ee9b40737598661a9ef1ba12772913091ba5c6a3f0c4cad5750c2f size 755094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7adb9fec2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d936dfe497b8c72371dde5b9884f5c5f3c6d1b667bf73aad79f7b37499ca215 +size 776972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6531ced5a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4221feffcf6cf157c7e64b94bc78e3b861ba82f0afb85e0cea37824c61a042 +size 687276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 951f223610..e938654440 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57fca1135bbe92c31d227e0d17dd0c8eed482e4a8da7e5ab94d3e82f0bb58539 +oid sha256:cb2ef5a1759ea0b429b1ee123ddfe083e995c614bf41b25b7ae9d4293b114a6f size 826512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..46cad46439 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4b588085f9466689257eca7ed2645dcb040a878d788151536cabaea99fbf457 +size 841040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5ab8bf40b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd567a1b872033b60bd10dd0b3f3ed7c594e0f1021f6641e76d6254645ee2f3b +size 754206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 8ddcc77827..58c98acfba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57bfebaeb33d796640dae6c31b29ac71bb03a55606f7edb4a9c4b9f06d4ce9f5 +oid sha256:5dfeb92df125475659d7aab8b47a2ccfda556106011f249c0f196b19fad6cf16 size 658448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c06c5ad5a4..df4dbfa4fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a68e522f5b1681f1081517715c01a711e561ccb6d39541836a9741042b40f68 +oid sha256:7c869ae3743fa6072514fa90ed66b735748bd57beff80c94b2daee123b3bab99 size 684322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index df87a03b56..27de72c975 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24e427fdbbe68637779d89d22555c3dd5c4c1c0949a941b34e3a3bb69c223293 +oid sha256:32d91e85dc63dd06c3e5333b2e973d6b2592bbe385ba86399b6876586f299b87 size 663378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 118994429e..c466486292 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eef0b53b8ef9c24149b09f5301d680750931e6f48822370fead5594062da04ea +oid sha256:6eb788b85b63fdf9286ac5ac0626cfcf587c05fe865343bd6d5d7dbca9a19711 size 690832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e7ce4e56ac..14f5c9ea23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7480249b2c226d981c2f289b75bda893618abf7fd7d7b29af782a10ddec4be26 +oid sha256:0497d004ef60f025feadf7ac9adc903c21b9ca04f8a16792fe3bae7ef7ae243f size 727644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7989615d54..0d96f599f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:476432362faedb7c482c999fedb05ae8a9e3a63a161e285392fe20eb72c67106 +oid sha256:0b38a45042ff8894652762dd6d8c8c44b768bbff8baba66a33b5cdea59f7b471 size 754062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f2f72f3680..5e45c8fb58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:abb486fb872a2549f660d001f45157f304e577dbe82b8049b30265485793d9d6 +oid sha256:97667a82c967e8394c179157113a05b15b7aa18d5f7ef6e4127156c5e488ce0b size 743746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 9c0477edfd..aececf8769 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab17ddf85b42d468d3848681c615e7d9b8aa86b58d85e82f2ed42b9f24c25e6a +oid sha256:7a396ef3bdb691fefee96b30c65e12c4295ff44e9a24fe6c868034d9b7a692ab size 770164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c3d67a7041..b27fbca030 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3be0901f7b8e4cc85df91b971eb8eb92855efd9d453f503bec09eaa9560fa75 +oid sha256:00a8d24df08eb567015eba6c3caa59fb42a93a24ddf087fdb644839f2d7755fe size 746704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 10f21333ff..badf5ff3f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0931d06ed378a543a9545550d63bde209113989665e4100f0d32f079bc72688 +oid sha256:2e7ac59dcb11573b114e3d5328049a18f7736bf214f479a2ecc440b324aa8165 size 774010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 82168242a1..3b943d2092 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bb2cea2d025bc93b64116db19baa2ce040c43a8e9f2d8eebe1c637d22af2c35 +oid sha256:d807a408307ba36263d0c33c2529449f0e5271bd35c72b04ef57a620f2d633bc size 814128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 462ca48f7d..d03b9111c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1660fdbde717d5ca0bb84a6dc9346763308bb0acab0ec86b1d93651cb37cfa69 +oid sha256:18b69602265759b1544581648af683f3eb8f07e3fc6bc80ab841da8b16e8f5d9 size 841582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index cfbf5c6711..0c1e3e64ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22afd851a3ad539eb872d0e450ee35e411d3b18ff0bd368ea55c5ec4f9627fb2 +oid sha256:81ce564dc0678ab87dfb6f1e31e66028f20f26f48c687512bcbd9e33f71ff56c size 733238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 04d42783bb..69e5d6b60b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52a0cc34e8b2d1deeebbbb56259d348b47a4c92af6395d1b89e870b5a08c299d +oid sha256:a4ba765e952083cdb0c1f9bdb41c5e68f191dd49eef91008f7ba752420484e19 size 760988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ed5163ef05..b63ee1c7a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69faf900729df40e9407f076c66863d38182f2409131d21385852bdc0422d027 +oid sha256:42fcda64d10fd4b734f95f0bd87c4a2d3a32acfbb615479d55487f3c0b556d8d size 737578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 67c95c8b4a..cb75d24979 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5fdff387148131d9fe8a77e1a177b8fda33205323b2af132575250fcd985db4 +oid sha256:9beafd81313a3620e802ff7f4a384eecaca0eea28f3df63a9232b25cc7eea576 size 766808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 91dbefbb15..846013b828 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d157ad5e365d9b9b6c6803db0337ad1f619466d25ce23b2333c40958395853eb +oid sha256:0763028ed04f3f6cc61411e42e58683e32b1cc65759af0e60a29e140031e0d35 size 801992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 3da95896a6..8173b3d979 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4de76b492e9b00f19c557100e87c5b64b634849c7d643cad944e330d859f5cc4 +oid sha256:c6b1fa091d02753e0525005bae5668069b048791ccf639dd4ebcd4ebd3b5e961 size 829988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 28b513552e..8952268f05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d4f534b062f9ed40390e8478c3239e3e7448a8aa334cc79dde472408ce2d3253 +oid sha256:67e7ca44a29287fcbdfff35b2644ac3da1b7f7b448cf71316adbdc3a2f0882dd size 833338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6df9ab622d..cc15f03a59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b96353a60025f4ba27cbdfe041d006a98c712fa91979d9c92a8992fbb0f2027a +oid sha256:ee98f428bbec987886d1d4f9fe003a7bec936abd7a7b9c9a4e18c0d22af4bdf6 size 856746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index dfb0d967b8..0c7c7a397d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16f4458bb2427647f779163c8b262342a1258a406d3325a6efa4b1ee5ec0cb39 +oid sha256:c73ae027f6fa57da513a24da51eff5b238f1cf7d56060160cb047876fa6f5d31 size 836494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 00ec69025f..faeb3196ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73c075965e551a803999fcb7464b02278c4e42955a6674d2467eb05cc17490b2 +oid sha256:b3a69c4cffe44f4b631c4c5585e5024a7a9b82871079b5f5006f59bcc599d751 size 861382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6c2b8498a1..f8fb8e9b5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bc04938cd5523cdceb566a7768f4e949b679406870dbc36d9b98348c721afd0 +oid sha256:893e6b2df728a55a7e48074dce893ed487c9159b41b985b6b180a9e66e1470e5 size 904360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 64d754ec54..21cfe51cb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:588db0db5e65e972faf80273c4a0482c58718857ad61c94e8d5eb9243ade791e +oid sha256:932145bc9c9a45e4bbe17ad334c7dd74e688a2f01190035c4e60631c28f9513c size 928952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fb0be3e3a3..68c4a9397b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5098dea88c8f2bc37c9f03024cd3e9125d116aaabd9b8197fc5b939db4ca3766 +oid sha256:2dd5190fc513950e12fe4ed76e37b82390381e200c708f9d9ee893f6c9300f4d size 695992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..51e356d983 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01523af3cc9e9b099e08330885ddea97001596e3eded7400e49288223c67de51 +size 716736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ab3d082c1e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8c0de02ab535c7fa2ff799bc32a0f765f441a278cf3fc660487e8d70fc714b +size 614163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index fe52ebd060..23576a3373 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e18911d7307a0feca21b31d27907c04bb8d4039b8e57e5c0d6aaba5b7c45271e +oid sha256:76df2711ddf0aea53c71679dcc65a2d95d34786937facd7be5c840ac68855832 size 691156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8d7994da4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dec6af08603cab35d21370479e07d3d6d44bd3f93774f4b3f8f12fd1be1c3c1b +size 710320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f64430156e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63fbb908f051f9d01f637428c3ef30ef419e8f93011448e4d10416b5bafa1b0b +size 629406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index fe1dc90f84..3b4b5215e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33450b41a0902c1c66e8edb40a52f57063e7a1a8fd04a6548e8355dd70b38064 +oid sha256:43de90d76b6065497c0ea838cbfe51d787c7dbd0c3e8f30329f049aab9b19967 size 761538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3d79515f09 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35b536747b0a574be21365c3dcc4456bf1921c2ae6c81ba85fffe9add887a02 +size 783122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4b61236b81 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:728fe89413d28fd5cb7efcfb6e2b76c96782a83a03a711a62987cf8bd8aab897 +size 684842 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 75c9b825ca..16335cf93c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea3fb0bd470b4338fff5b261643349ba47ea7d61361482c1b00101cd67a61c84 +oid sha256:b846492e417c782b18cdd9e06b0ae114e8f9be26aced56b72cd1ee55355a7602 size 802900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..766198fbc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b0f6395aca8259eeb5811bc989499b0424890c380e6e87323c009a6ee3c386 +size 822756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4bedd9d92b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6319678cf54e7c677b9bac51387f6f8954d699014144e50076997a7b7eeec1 +size 725364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b24a2d6c69..4ed73436d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a059e584060691e4619fb5f8eb45b8b282ae23bb6d39f38131068e861b5bad54 +oid sha256:5bfaf5f96938c8cb018221e15375d3193b93ddd899111d55a32f5532987884d8 size 802946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3a4f711e98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86eb0810ea71e5acfa84774092bf916d071eb6ecd5c045e206608cc0b23efd93 +size 825812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d5a7372a3e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53bfb00de353e0b44ea9806340c13db01a8c809e8405a82157f4c48af7a82acd +size 739422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 83c478681d..f73664c062 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81c06edb9e0f51986571df51308ae8afd55397cb4ca35260c5a22f761c2556d7 +oid sha256:cae30918327aff8788b2febbbf4d3bfff250281fd2158905ee24f4e120b44489 size 874564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3c32a18a31 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8462f1a9aafee2946425c9af9a22976404aef341f85094485052062bef1be82 +size 896244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..12abb9959a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a3cc1b6f2239b24ce9ef09c0b7c7a2ebccc490f694292796cd6c291f8662c3 +size 794314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index af8dc5b768..46299777ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7915e0ea327a04fec09e37029def3bbafc1e0cd49f656ad647a04a3f31a57eef +oid sha256:cc59ed020d3947faf24f39ea07fd1d6f78d2193adb407b170965ad962b23e340 size 691452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 6edcc42666..383755c60a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2421a904d6c85547b029a7e6a11a7d5c0370d753ac17c0236dbeeafe76d9716e +oid sha256:a14eee5a9f77460804648094743f5685403d4317073d17eef38f9f419bd86291 size 732126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0aac90e726..0399b6724d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ebb6c6ef5d60e2f5dc35227a902636287ac403b786e6e39e0b6975c5c388c56 +oid sha256:46708f22ef7d55a403409308d08fe20072a4ae52ac0f6af4f146e9d01f742468 size 691498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index c4f1bcb08d..0efdfe9c43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e3d23f4bf2f7d27160559fb6f312259900598e49901db36fba4f2021d695972 +oid sha256:7f3f2ac1fe48bb10785d0b353fb22af9ae42a36373f517553cd960aa11944feb size 738192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 76fbb96ad9..4213e0964a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:153e684f6da98160b7e831a3c553e41e7c435079f94faae31c91a165b969132d +oid sha256:8e5e53fa3268cbf14b1e28072a59ab7e6fabc1f5ef6850e158d7075f09775fb1 size 759464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index d422f434e2..5ee12a0b09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf82eda0839eeb024ed6254810f12f01eb794878d247a2d06937d031164a623a +oid sha256:bddf7f9f4b8122ae82d42084698acaabc1053015130648c6bcbd1430d166d448 size 801866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f37dec9c11..b57e8476d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1b1410547cfa358df34d4acfd5397c08dd6d5abeb537e6602e72d36b2c4dbff +oid sha256:41765b6dbe6e651e2e84aa4de093efb9df32e3ee42b641c6f623ab2e99a776c6 size 777194 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 7e75d7a2cb..7f3d23650f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:546648f6072b2254dd99ce895c44e5e82b3772c415784045dc19b153d6bb5044 +oid sha256:1eade040dd289cde0a071eba2a3f11ee80caeb9ade5c9a999c2da26c015016f2 size 816882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ade3a9ef33..9668cd2232 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae4ec291f28d4d9baa1a1c004f934b9cf011a5d4ea7bd20c79c9af2ff8d3e9af +oid sha256:dde70c68c51ed00e713f90c70abb173b4e6d4ac0257781217d10d859c081d0c8 size 775960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 42aeb4f00b..5f0802e5ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9759934177e410257f5d7ef5d0e7f13e6e15403dd1abee2036d5058f6a3af2bd +oid sha256:df173da9728b07346fea5bd1066b9b55fa14de71883ee9df06e5f78595edf7c1 size 822110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d3aba39c23..0342cd4cf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0772e04480c534cbaed88fa2b84239a9c34ea259bbcfbcf24f45af88db0fd13f +oid sha256:8e4f9e26b093add490eb2664ced021432d85a168913bd30db3222b5d3015f6c4 size 848414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp index 05fc6ff52b..d276374864 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8452704db701fa68a494e6b7f3e4aea398f4be7cec1d028a465af1d2282f57b2 +oid sha256:7f85724c4804c5f32d2655c1b2ce8da478ad1d2950e6af4daa3ff6a548e519bf size 888350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index d650b9647d..fab7286a1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -25,7 +25,7 @@ namespace kernels { // clang-format off -#define TLLM_GEN_VERSION "1739609a-dirty" +#define TLLM_GEN_VERSION "851ffd49" #ifndef EXCLUDE_SM_100 extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -183,10 +183,16 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrCh extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; @@ -209,18 +215,34 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32Va extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -233,6 +255,12 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrCh extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -323,25 +351,46 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChu extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; @@ -352,6 +401,16 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128 extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; @@ -364,12 +423,22 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256 extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; @@ -382,6 +451,18 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256 extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; @@ -397,11 +478,20 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDens extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; @@ -412,6 +502,16 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDens extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; @@ -510,10 +610,16 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunke extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; @@ -536,18 +642,34 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeq extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -560,6 +682,12 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunke extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -650,25 +778,46 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunked extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; @@ -679,6 +828,16 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128Page extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; @@ -691,12 +850,22 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256Page extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; @@ -709,6 +878,18 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256Page extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; @@ -724,11 +905,20 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32 extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; @@ -739,6 +929,16 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32 extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; @@ -805,32 +1005,60 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -977,10 +1205,16 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; @@ -1003,18 +1237,34 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64K extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -1027,6 +1277,12 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1205,10 +1461,16 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; @@ -1231,18 +1493,34 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64K extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -1255,6 +1533,12 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1433,10 +1717,16 @@ extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; @@ -1459,18 +1749,34 @@ extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64K extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -1483,6 +1789,12 @@ extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1663,16 +1975,28 @@ extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrCh extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1699,16 +2023,28 @@ extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChu extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1735,16 +2071,28 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunke extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1771,16 +2119,28 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunked extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1807,16 +2167,28 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1867,16 +2239,28 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1927,16 +2311,28 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -1987,16 +2383,28 @@ extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; @@ -2180,10 +2588,16 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChu extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; @@ -2206,18 +2620,34 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32Var extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; @@ -2230,6 +2660,12 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChu extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -2320,25 +2756,46 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChun extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; @@ -2349,6 +2806,16 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128P extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; @@ -2361,12 +2828,22 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256P extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; @@ -2379,6 +2856,18 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256P extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; @@ -2394,11 +2883,20 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDense extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; @@ -2409,6 +2907,16 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDense extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; @@ -2507,10 +3015,16 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunked extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; @@ -2533,18 +3047,34 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; @@ -2557,6 +3087,12 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunked extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -2647,25 +3183,46 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedC extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; @@ -2676,6 +3233,16 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128Paged extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; @@ -2688,12 +3255,22 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256Paged extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; @@ -2706,6 +3283,18 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256Paged extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; @@ -2721,11 +3310,20 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32M extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; @@ -2736,6 +3334,16 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32V extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; @@ -2802,32 +3410,60 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -2974,10 +3610,16 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; @@ -3000,18 +3642,34 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; @@ -3024,6 +3682,12 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3202,10 +3866,16 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; @@ -3228,18 +3898,34 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; @@ -3252,6 +3938,12 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3430,10 +4122,16 @@ extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; @@ -3456,18 +4154,34 @@ extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; @@ -3480,6 +4194,12 @@ extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3660,16 +4380,28 @@ extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChu extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3696,16 +4428,28 @@ extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChun extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3732,16 +4476,28 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunked extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3768,16 +4524,28 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedC extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3804,16 +4572,28 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3864,16 +4644,28 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3924,16 +4716,28 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -3984,16 +4788,28 @@ extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; @@ -4056,83 +4872,83 @@ struct TllmGenFmhaKernelMetaInfo static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { #ifndef EXCLUDE_SM_100 -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "44414216cf1882b6ae882a8ddf301bbb7b05c407b78fe6369f0c011061253b4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e532e93b3d38d2ed32607ca65633b65f00d8dbe0735336df37ed356a52817cfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "9741e3ad7b62c0a6555e946c5860f66d9a1ef359297eb0948cfd2661f944b670"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "7468d167a0881a4fc257057c40e56182f2fa51b1b18a7561654fb9c074f9541f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2cb760314474aa18e294da827e568a89b0f8aec22126f2ae979c339b0def8bc7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f2189679c67353366a8e84ee882c50233ef828c100d65be020c74831d0abfec9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "9ec8e5a38d4e53a3e5bd4ec211c96590649151243dfed44f8261b4d82bd9e2ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8fab354d54bdd34434aa3f9de82714c3fbe58405fd16e085179a9b79f562f4e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "c3833a318b85a675b8ebd23fa9239987b03cab3cc383057fe95104565507d743"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "93cda54c65fa5b17222c972d319edf1fcf16a143452b6c706be18b7e97cbd166"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a98fe960926d24768b1e5abb9a8bab00b0077426637ab9203c5fa77363574161"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "f50e352bf4916449d64496099d8f9985252a2c586898b739029d3a30b9f93ff5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "bd38e46b3f162544caa6337f3e49edd31bdabc8c167641d6fe029d52896b7cc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "080371c16b9f86843e6f1102c6161d8c36dcfe3d049a94ce862748964b7817c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "e8f1662c0ff27747336e1088e577352e5b513aeeea8b5d85c42addb8758b108e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "2dfe926e62058d75e524a0b00b8bfc3b192bf08b5faae3110081b46b133a1ecc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "f8bfd239ee9198d8f224240a939421bc05fe2742d4e47cfc70cb964245d642ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "d4f885dd6d0278752145041c20dc65b1f244f719091b661f717f3b6368d2c2c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "3fcc0072f443a1b284316fdd64eeaba4560fed5528ee5ca666f8abef96f4b916"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "69e41317b1bdde2a34a1f29b1b1d43ab38b06731c30b31a833ae41db433a20d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "13beeec577ad1a14514ec1276659cea7d9359f8764f27386133eb5efe8321fb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "e95e92145cc632219f87f9a46da01cceb2c9e366ffd4503008ef0fff6bb6dc45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "133342a71b6614dedf1a9dd23318719bf75342b9196a29a22a06362c4ef8e533"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ded6a5fad72a0d964e3f2fd58723494695763d32a1d68cf8f7492380112a479a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "6a63fc88eedeb3fd86fb41ae8f019331746db7bdfbc98060e22132ba71fb4d6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "07f6178b53c907ffb0e02ddf655221034c4e616c53b29140ba7920198f375537"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5edd4de84ce58556ef37b38a9943ac513f3ca8cb1489d7731567074b947fc887"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "d15f20d41424b7fdd3fe2ab1df5bd5522cf4da40ccd7bfb998a1e6c10a2b964c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "43ed84180394d39d8d11227de308a7c6510767c6b3d9425d032ac54335d33955"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "971d4b817e7234789d81632b9a23b4a645cf973126ae2c51fcf0dc521fa04522"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "cc883b5ce9eec560e5295452b790b17050d20e8b9eaef59f54b4b4c3259a3c69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "cb4665e95180585bd218241bfae93ad4b6654e286c0e40e3acc43b23c0517035"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "008b72a9753c543c5c134bb118c002a63610d0203ddaff112085edbdfc222983"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "ffcef1c93d795e7ebfafc869df59db37925a927c820e825cc2ea6cc6c5aa1f92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "3c18710fa59d29d9c912b0b6d462e9aec01c07a243ef892955356c2c6adaa333"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "02365293da7786908fb04fea0ab75d39e0a5be6e977991e02b39f56f36f280fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "2ec2f3cefc943f8247403caa81441d8864f8b748cbeed3b30ee27e122ae4927b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "5fcd068e8500177b73579050439bf11c99154ff052e1c05491fd52ef73ac17c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "c9b6370b1e8685a45b62ed3e65ae5e0f2b02da13e45a7cb3c9b2ccfcf79a1c8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c39c3d1d1933125d7183f2623a5d26b668a8550d41b237adb6ab19dd9fac87a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "470add1552a41f66688a4d02aa1c4deea35812053539765dddeff5bbc7850c0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "8ea816fcc200f3d5e5dd921a3dda840a3e610354c2928e93c1fcc35d18f17096"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "380330aa71d8f6ea41b8d803c8afcb54afb39727414e1943d994a739c80313f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "79f13d35e60ce8522731ed758debebe9bbfe7d7b53a4169db23ea573988bfc5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1fd9218488492e8da6f6d47924d6260efd1377b835bebad9528c4c597dababf9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "12a9e25176f4a88cc18344923637342f3a4e1554346456384ce91ab08488e8e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "57f52ca096a9fb1ce9914fd59a4a476cb6e04ccfc27d8d3edbea7c929259136d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "0bb112008d1d32f050bf4e891247b3a652db72825a7b650f0439434ff4007ce3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "72b7d0d237c661b03397b62e64aa6c9337cebe9ff08d7ddcce173983e6328491"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "6f25ff6478b7206437a8d0097055804386731b0516571eda9b3e499f48069ebe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "443204973cd4b2354a446d3bfa493d4b78ee14120d8a1697f93f20204d25611a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "de98fee30778b9da736d1ceaa73804a7e140246ca75970a588ddace49cd07010"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "ffaed78083d160b0702344084bfe94e8cd347f0b3433c95709079e70e317089a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "af0288cbb231cf5f6a3b359c5a3b4e5dc1dc3d138e1d7e81ce2a40ef645d6459"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "1ae08c83389bc3413809f08844d8f8403da55dece979d5d6ef729e3aa656513a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "ab170c4b1dc72cc6f8aef5629ab569a35b6407381a4b323c0452da2c9d3ebee7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "3073fe38cf8c95ac00cba404e9a6b80f4e3c1142882cedc9b5bf451b155907d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "7621ea164520a8ab0393edd2a4a6502f14df82128f087557f9efd075cd2b419b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "1ff4ce3e0100d04991394e5de6084cab4d86077d34ce17466fb25ae26a1a47d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "5674f529d062af65acb7ba876f1ec740b80a1e57478413cd920f875f43107e35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "8a509639866b1a0f775581cf18073a8d8384be1a6ad9b4c3e20d001094958254"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "d02348e3f57790f177ed67ff340c028f71806c7f8525e72583f86e6ed05f4322"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "a27d4237b777f33e0e2e2c838db1cfc12ebb28b683b9ac2d2dda0566151cb002"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "2752c025ec18771e491a610c48a52a9e9436166442b9314b7cd57bbbaa1f4bcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "e33250d483e8d76c3d1aa77ec6c8242e55ac56295e8ef58969e323ddfea80f66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "81a956ef2a454c6493f8df3e4e59f8216c815a3f1a3817854957822e5b9e4375"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "80281a4d942f395c94d6ea37291a7f88742d8525279f83375ba23892e910b8bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "2260c7504648e70eb2a16a24627679d76e9fc5d0a9e736452a4ac8ca4e25f27d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "a446cc59a6bf16f2a5f04d1e89781e12fba36b686de3dd75687f93634af7709d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "53789b1bbf5145756914025e3c191297408e66c7ea44ed7018e761e914f12ec5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5fa49b67fe5c7b591fe263044c0f9bc09c2fe8a68a2517ed37c7d5dea98b8f1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "00534a220b0678380d4d09a89a25ad78801bd3d5029d6ac613daa32ff7e6f611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "185a14940c97342680e89462ca43d5ed5014543000b4c8342fea71520efca058"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "df616ce1930bd169e917fabfb47a616888d3c3f1c83cfa2926ffb6c91de52b21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "86c30527fd0e335a36b67ef9777748de5ed5482ab420d3a93bb66b9462a68379"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f4d6af8aa521b1e040ef290ca7040be56e4ee8061a78351a1a97d513347c4203"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "eed022407eea63321a3c9567443bebd10d3e7329ed7ae955ae81014f34d7ca90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7c00785a939f6dadc9478d123d0d136c50aa2917e9f1b48b190734cce4f70a13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "01cabb44bd8cb6a470a09e47cf49acf8fcd81bb409dbe5b0251d7b9f12bff0ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "c9988f2011b107c70ac1f870e9b6163849fbbb910b67ce412241190be889d40b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "7f0016edeafd091fe4b2afe1dcb39780bdc8f3c643a5824f14fcbbb3a6d9eadb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "aa99e4ee714a42121a6ac4dedf725eb75e8dfb68011bbebc8152b77c5912e09d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "4e1186d369db3808fea023d9dc2d2efdbc02fa04a16117230cea9d8fff38e173"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "de2292bfba37d40f4dffdf2480cf63390bdd38e0830d9f7b05d559d74f63be5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "5bae89ed1f4cef4e55505191bb6b997bc8034a85ccc00d9145ea24122119d0a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "5ce9ef833309d25a2c4ff478a66c03a8cec651aa10534d5184c52d386f49c8aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "8c09f1167ff758a0e4341f011bf03b97a58bc86c4b9d4695fe38bd8dce47b7ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "719bd998e1396e61af2422d5ac5ea76cbc32718da04cad53dbdb7e0323152100"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "be182aa08935cd3306bacb8f9c3ac95a596142c721c138df622f0ecfc4c6c612"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "43de0acd2e27f9e00887f3ad33687f205715f9b1b9516fd23df8ade7548f6060"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "0a9d70a4e3867634537ed4e74c4dae93b56594346a1ef0cd702b361324dbd652"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "d8e8e4c41eb5b383fe7b2dfecb5490b552b895b0c410b830e652f63b5bd15035"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "19d88e45ac74a78a52543edcebe562cbb41c3e827d9e21b34c3db60778cbb448"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c06b1dd5f3541e984aaf485cfc5c4b855b6dc60bd35feca0e1ff12adc6568e3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c69b434d1a5110ab406152e8d72223619baee22684cf938b3f38496fbb8fe793"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "64e2a8835cda4c237bb18b416a7802184864c9f1c2e196c334093f5b0e9eff0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e8bbe33434c86f4400379bbf6a8a339dd02afe1d0c3be92f559c226a737ecf7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "71e0f9d2ad8221fdd306649745f21acb958236756c469957921c69a201ccd2ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "e788053fbeb9fcc36e29babcb0a774a073eeaa648ab98130db8946451cdf9a78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4b9c6c389bfe11b6fc418b231b413a63c5f7ecb0bdd7d2e4bc37b37acde6ad2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "6c4559a81ef8fed329100e119caf1d0313083303aae1a29d9efb95f17248c12d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "a5732846db4c4d074796d033839d8d6cf96974e7779cd9c05c2cbd171bc602f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "9704ab2572674af41223809210957e1118d2d85111b635e41cca5e6e879481cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "578c62cf81b1808a5868ba62fb2ddcb6a977a30c26c0cc2f509d1318cac14b0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "c3970f89101c56963f8046403c6fed28d1570acc01d4de7bb0d6f5af14665e37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "3e995d00cd9784f74b14808516223bbef3e4396d5f15161331e9fa1df73a42ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c7d4c900c8e27404591e64a93123581a864aa34f0a672c4990faf4467de00ad1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "2bc234c9545da3281abb4ba2b6c68404696897226fd74e423ab4e566ff1423ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "539f53b21fb03f826b4af455771063fa24878ee89084d32761c6607e2ebce3b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d63b7de7c4e174a3e907d71cbf8c6b54aed035a6e4a505df97a71076b8bebfb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "f9bdbc9815bd67d04fa1c16691bba41f341ba0926317cb73e236e642a25d8662"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "538f2e9647aa8849256621cd7c9ec673b3b3477efa559f45394df27f03c25423"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b1ac15019c02a9a7e79bf529f0d09ae617e3e158f20a4d44385337ad09e2b94d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "92d42e11f1a07ec84e4caf46410c8570d1c69b389fc83a752f52abd02fd58b99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4ebbe595d650a19ebddf0bc2581c49160ca48269c7ef4bc47bf9d1a62952b05f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "dfb51e194877b1cbfd4411adaf41e974e9d3fe398c122d7ad109fd6ff852d18b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "3a82dbe23e28b81246c4dbcc705cb01d0085dc118c28e94c9c2dbf3207229011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "02a3da985ae0b9ad894dbaf99b2b90ff43c1483c1b878524f0e23332ffa3cf1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d58be12a89d904180df2c77bcca43070c71aefb5a883f62a5ad3276a8a9923f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5ae6087bb71cb800d1714e03c37097a6082a51b90295bd4c8bdd6c2b855806ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "86815baa97f49ea8fb7469b6b244e611fa5eae1ce6549a00dea2cc9b99c3a01a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "ce5516daede0c1b4eb151f9094964db314a8db918cd891db09d4c432bba32b19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "97f5b88c23ded5733134a3d081e1cb75a08462d386afafe5d8f9175eb8505079"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "24db69790c85e655e839a89d2f78e9d2276578487b42fcac366e6573952f49a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "cc906eada5d0e8c9dc0ce697029de660ec336ae4f6beb2d62778f9915d0a1179"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "5796aa0268144af191403a95fed456438221fbeef89c7dc712025afc07a8cc83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "e78273ca6d9e58650d47989be6756880e41833592180a0c421cf28604806bf43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "1ca4dba30d6bf455a1ca70e3014a73d11d51713f8265cbf0757bf7bfd59f4a80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "15892fecd42d25155680886a96ec26a4aa038b86ed5ba758406658c1bb8011ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d688801036fae7d8eacbddf1d87f43dfafad1cbae124ebf36126125691e12771"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "416c5e53ce53330d068d9ef3cbe959eabe6720440771ff5a06e0b6ed34a261d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "488a7e0f8d73eb627c51295766ac76f7d54d72e3672a49ed4e71f83de2461bd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "6a3b59b4ed0d67f50ab6a282364a4696ed93c429ceb9463451bffa7fbd5aafe6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "72283a0ae785fce1c83706bad4f38df306b1ca10c119738eeb3389956c4c61bd"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "33f66c47ed556bf7e3db131516361515aff1c1aecc804ec99bbc86d26dba0d40"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "d60777598cdb2974a6c74c265b5b0b94bb1d8e1f4432460f5cac0632b4523bfc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "59541e9aa4347d12c2f7cba5f3481be952a1f04a9d98f35cd8d75723dc7b68c8"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "da4b9dc0920d5f14e8ed36cf9335523c1bba191346fe340ddd2d208cc8250535"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "63ea1b5460fd3c7a8f4c54e6095b8d55a9aa89b15cc9fdf62e37c3ed32f9b3bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "fedbf6d2fd9dff9ce29f04140ef157b4c737ee63087d72bd40422b8e8fb075a3"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3be4fa914f1e509ce24fec71d7aa7424787c6336dee5ff719ff07deb59f67db6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a2aac8e514f725791080ec8f9f5391338557029e31e8109824eedc9e8bdda672"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "e64ebce31f8d47d7202fed635a996806e919bdd40601cccb6cc3385566d7011c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1cc07bd593fa27dc2672f7a3b88cd26fa72841798c12c490f5dde32b46dfd377"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5b14e2cc4372e78763572e2899db40225b7c5f7b71d64e504974602cbd3a3f49"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "162b090e96d180d3919e20ead354f22bd8572991a726efc54899fb732a655714"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f271f539020ad41788b83759b39aadbe20a3ea9cd01f164db9d5ba4ca52bf3a2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "3bd9196e7daba5362f9f9e5371d2f9c988f7eaf4d20b4ed9579c9ab65905120a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "9cf31a516c824d06f6ca68d95fed5242f7cf9ed8e99d9904551f33d9f6fcaf58"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "769f62615d2ff88d0197df4e75b23bb490aa71347e6c861241851a6fe4a5a805"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "c3313c043dd1f4c3d3a2e049ac023c9a048b7f597ddb00674252ddfb42c28877"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "46aef632c88e6769b6da686d33cb69acad48af723d61795f029c65490d4eebb0"}, @@ -4144,135 +4960,163 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b00dc41d7129b8f19c55bdc214716c6fbb5ab8bbc6648e15dccb2c9660e4f66e"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "dabf054b473eb0ad5965c88d00b940574e455927ca3b68b45d0db89a3d78986b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "5a71a8c875af42c6d4bb19606f5ffe085c36a7ded853d4202c346c08d011d88e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c3f88494e887af9a23a46e91942a88fbaeabfb48bd86f9a63dabe5eb3aa2635d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "2f5fe5a2fa4c993769e8952d09aee2ab229dcb4b194c37ab77d77ec6b8c454cb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "88495f23e32f79dac6c6cbaf0392c5631cb903c41acce92168ed7946e33f8d76"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "65265ed843abca30ece1cbe9118a5c8fc0e15fd4174b284b4b1c7298c51c5dfe"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "506ca4a998019c8977b3130eb8fc73959d4bdc8d0416e9270a5304592cc49716"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "6271b520279913acba35bf49884d9065dddb24d631e3a2815eb0bd62df13afa8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "5840e4f9cd51e3f5ac75825bc28677564bb859860609a26832560c16c05dcfb9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "63a5910b84fbc2139c39a60c95e2469f3895da1829df14d50a5afe503d67ff87"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "db6cac5f5329a8a57471fe54a625c4c57c612696e85d3704d290edc19b6252fe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175296, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "991b36e7da62d04bbcc6e4bf614737e2a17282dc1582f3723edb3a3bb7786ca3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3e884f8fb94f7e6993a599b38a653c24cd3154bf7bd27e8e5297de0e7b68d1b5"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "5aa20be590bf09fae4379c76e00a1d58df5afc88a25abee51994c7f8c9bdf4cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "b61a1590270dd920aee4664a748dab52d977c73a1c25c7860381d3640351e131"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "44db523709fafbd3388a1f6f217ac9ade032421e1791ed0eab656c49a01ea238"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "e6f7b700abb3bfdefc3709441db649001c88be31d137d195c6fed12be069afbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "bdf317182723def4e39a71d727206050ff6e0afb07d7fd6fdf58036ef8f9ad65"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8b2c8423a3c285a01cb47c0c6046430c8fa066bba90491ac88e6cdb6f882b5a2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4e79b6c5e26b146ae445b5b8aaf49fdc15187b209b2dbd4ee0be43446a36285b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8b63835f3f3a79177ab50e6911a75b657a9214b178391bc188b63513bc4a2803"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "649706a034792a7c79fc8180290396da45e21816c8cb4d6f4c30dacb4d64f4e8"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c3a4135927d1430539b117b60336dec562b07116d1f0a5b08c711eedf535f43e"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "b47f9ef7050f02dfe477aaf62092572bed24ddf855ebdb9e2ade77d3ea70969f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "1998ce656a413fce9bc2faae5bb6479f7e696da7fb122217354e260be146a68b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "3f07bd9b9e05a379e9e6c5a319804e17cd25b7f4f9083a07b5ef5ac642a7784a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "189384239fc9b6f94083518f3bd5eb557b7f00e533ac156c3142e4d2b4496369"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "7fa2ac29bd69e30b49b41c19e49e4d556355bb0e2575979451650aea4722f9bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "1ae518e3cb884ad25d382d7937be62bdc658964d2f596f4fea85cab61bf69947"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "d0fbc8abdb980ab5fe32ad61c700bdbfc3b0ae27c2a650d9db82ff5995e28ae6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197840, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "b15a9d57d538356e1193629cc2c46b96547dbb5ab28d2ca73464466861b9e059"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "f1bce2ff55ae86cd317a5cba0259f31e592a45996fa660d6d66495816a27cda1"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "f6bc0a63969e12ae102267c8b84b92abcfa4f8fa22a740b204942c80accac265"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bd3dd67dd1034341b0b7457956ad9bdedef1b92734871c64770e5b9d0bbc086b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "336bcce947bd5e1b125a7fe6014c986f41a0584bc292a0de83c6d19b43bdac38"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c411e7455a8c58befb4f3d7e648f91355627133e8dd843102461b9080f56a962"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a779698308a8a8b3895206151d2b8ca33594ecf624514e1559ebaca5dad5d7e8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e3c11b9440719c2cab765390861ff0a2501f438854ee50693e04c47e8433e4dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "466e067a3dcfbcbb46cd582602bd2b80d3cf920f608362533b081d190d81f6cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "ba74e14543dd04695470f1624b0c733ec064595b2b06a119c8207be7f257183d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "4bad7593089c03cd51043ae9bbf81cf1fd467d112b4fe7d5d752d7f3ebbc99d3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f2c9f76783012006a2a580289fbe3b0b74c52be0995b7c2d486998e2f614b544"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "02d08998474e615d09afcded18ae00ca6815631eafa42b989bb208bb60c44db9"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "ad0b4312fec61c3b4b66cc7ddd1fb75e0763a2e54847116948b5d39b63f36153"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "b2fb95fe1727997700bc5d6f123e381dc3eda7d7b5b70d2979f566c20957ad6e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "5cda23bb13bb9eea9197a2ca2ca8f0a9bb8f49e4eafb0550e619a72901d466ed"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "2499380de0198268ab63d3d2715b6c5622628183a90cdc7fb655a7cdb8e56f67"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d4a3a330e08627c6cabe526fb86926873ea9f0a64e55da0b25619c5f2548bc0a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9886cba79b7113a4dd416ad567543da4183e36dd648d52efd2a960a4b23af631"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "56332faa20a8bb63362b164e918c42d65736dc4ce5f591bb676f95668521bd42"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "124d820940e85523c88bc94510cd1c3b92c7e2ecb0f9163c2394e7b5a2025a70"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1fdc746b58dd3d5f82fbb309f05dde7381ef92cde51818000d600c54d3c374b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fb024db67a5967c7c990a1f3f82a88c2e881a7c9f27185154855c3fb4c3c08d2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "20c8bc4fe27bfd1d45e4b86b01ed99999e36b0406666b7457ae7756476e027df"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "acd66ed35f6a0656a0bff227c9ec3f72b10530a810938a73ba599f0b422ec043"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d7cd24522d90e16a7e8c01e37820bf5c2574bed314a42b32837de7fbec2d3f47"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "ec180f2caa8c9a9b9238c8d9494e45696aa7e5f7a61aff645ba50a43fad31054"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e7ae44be15e8d1ea69a4e85ee6cd34c6ec184d98d18be6c29fd45ec9291b8ced"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "7d43fc02c12af6095ba8b5677c8903222d6d37388c9a465b9989c4229f0bf40c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f850ef50e66212bf0cbdbae56fd5681fc7b0ae99050c11a1933349b80239b79d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "6d9b94afe9b6ae36332029003c97809587fe31ab91ccb0b22f53ba24dfa3320c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "ac0d6c284b62aa38fae8217b6a43bbe255239f13211c84540482212cf35c0084"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "006610593ac402c1132dda49ab7dc7456f9da243611560c05f499817d43f159c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9d26f7c1d68bfaceb14bfa6b809ba9d8226bf232c1efd074a815800bd0dca373"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "9685018c96400d06382aa17e41172f4b05ff02f7beff8a59eab97313e36bf792"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "84a2a2ff6a5e084071030d8037a5936015a5534c107f58e64c37277ae05e4b92"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "6441a0d53a6da6720446775fa423419b664af68b6f3aec0d71aa2a9d1bf071ac"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d838395c277393ac0139004ba51b923bf4919c981ccea0ed213481849e39d967"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "bed7c1e5ebb7bf24d08cb69d5453419d77530f43b96a78502cd9813ba88a2cac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "af54e293eb305e64fa7b16a1c34e1965f55dcb76b09f8970f4db81c253aa20ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "9d3396722e5ec5db48af9105cc018686e18594e8402acbe9ded5347a7406a959"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a8401955c75abef334bbfceae5ab651b4de92a7d20a66526bf7b18e3e5c426ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "71c44d405a431de99066d065d784533a3ae5c7e8cea24c5ef3f2cfbdc84fdd2a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e11a219723d79fb9e1cc9d263282f51842c766ae5be9988444917b9612c6b98f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "11d1ad7f0b8a7558d7e0231a715c92a397958be54ade0a099b7de94823462e15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e041e3ec2650a5ccdfe63ae9021d55978663581ee1bde7a709fd5b6f50646316"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "be13b0e0cc44908f930323fd7c1b11d50f0df4705c03dc3379f833cbdd54bb9d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "e1904203d31396bc3666c73cb78cf2d8dc2d6c4fcc9b141cbc9053d61d91ac5f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "bb4e1a5222975e322ec00bc98808031b9dc3d52aab03671adf66ba75badc839b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "17ecdc74a5a9a29bb713a42082e8d27d7b6f432153de1161db6ebbf92a35e007"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "6a4037efbd506ea3f173d1f39e6bed1c5d8239dba28b2d3fd885e9b1092d087b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ad16abdf376ce2e7a15af536b67986a0b048c8f0f938080524c6ad2a68ec22ae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175296, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "8ad81d752df00c840ec2c76e3d9e7aa3c19ab92e0b93a856644a9df2789a76db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4991f9687e1964f24db37b4f3f0a69cb97233e49b87ccfe561a40539b718d153"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "64cab0db9c7e5ea47d949d6324eb52dbd3c2fa928603f47c65476088efd67d06"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "9da26b1b01b2d370435afe30cd65a258818f21e8b1c29163cec848c5f4eede72"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ac5d89b9ca791e4d6096bbf353c369833b20f60f8f5103e1b8eb842cca3a5f24"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "186c49e53d6c2406c507b7ea027c06a1e5c11e271b09243a16c8ab8f478d74e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d41eb6b05d9fa6095aadf90176b3a2b841ca6abe199a8715ad17613b46e9197a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "51e92da4618d346f36e3fe117ead9ebbe758f39f73bf90cabaa53f27776bd989"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e2ed2b5c07711791e491e4a31b51815d2cf65b92087ea39b354da047682fd863"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "60b10bd3de8f4174cfa254466bcaee256789eeed649b99456d78a347947117bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "cb44ab1ab37754a1aafc124cb5a266ecbbeb8f17f18f2b3ccf604cf4042f9597"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "f60d16d0e50cf915c3ca1be63a3b40ed0120f3f587dc1b80ebaf72740bb24f2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "0c845ded7159f738f59d30af7076cf4deb1df957f3534f12566ba51a9dd63ba1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "2ba58e0f9f649d30ec12ba0256e6ec496ecb76e4adb6cdd8f59c2aa61af105a1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ceccd4390b8de3665e3f899d1bbc72899d4ffb8280fc02b25f838a1342fb4f03"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "382f6d8b3f512949b38e9aded9d0bccd2b6ee29362dcadbca687f04b9f828b29"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "beeebf87d016ebfb0bc80fc26fbd93c93f772bf7d923491bbe85387e7349c4af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "121b17ce33cbcd732210831f8fcb7260633dd0d14f9cd994befb7bf622b52263"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c4efdd623cec0d878fd9188777e4046b320a447a0c656b4ec4a5b4d6ea9c1aa3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "af44651cdd759ec88fb799c7339f9473cebe9c92e888ff4cf6bdfda5dbbecc46"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "306f3e05317a576bed3cbc7483426aa9cb1488603e14255f6fbcef7913f1f576"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7fc1745772ec9f72806dcad826f67cbdf946a8dbb69acc807bdbf5c6954de20d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4df7859e4c99274af78d5d3e36e39cb26d59cdbd6ad787e6703b100f0b895e31"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "bc98be235e5af7b409d1a4ef758eeeea08a93096f35b1056397035bc676d4bf5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f69060adf5000681348cde28ca46df558feb3e2ec818bf6398ee93172df44574"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a3cdb54945a1f3074cbcd6e7252590d1ff07953e3571d0083937a56846faea1e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "f33df87a78d2c29d2d3a705d6e25ae6c8b5297b9e0e758998342266b6477a4b4"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5da834d304585ebb1c4d066e1c2332743c700ee46badc9c5ced265295b8fcabf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "3944fcb46dcbacbce002887efa389bda17948704377420f05a98dfcf44122108"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "901dde0f8e5ab3232b5319ecc6ae7edb3d668e177c7434032d0d55e28f64e0e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1c983be7cb136e3f6cb80e7d048dd30cbd34204a4d2215b38a933749a742b771"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "3ee162765b6efce0965426f17e8a03b442b2d0def00b67c089670d2be67a1474"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "b8935ad0347b386795cb9e9db0dc67ff922281d3792d7b810d1d32a7285f79cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "224c813335f58efc68f1cb3042397f8bd04c097f582e0c107100928ee59bc950"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "a659a4855ddd397300e0e270f9b85ddd3e78f2197464ee90c9cb4e37a955339c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "9b34e61f65a481e221fa052e435dfe329b7652e4b176f212e029d0006e5e2b3f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0b63deed1216292f4580f809269ea252a5e14a0d87e42ab213824e253cd34a02"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2e7c2c1d0e0c661db990c35546fff096dc96047aa440443d6ad5258f163f4288"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "206cc391dded08e986735314bc8029a8181d277e798ce2ffb9af5ed3879b639e"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f3c0ec8863aefa97df944b2ebc1c85aad07769e963c5f3f8d0e1b2f85047585b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "1b9df995a7fea743cd6d0aef8c8da40cb60ef5a4b1fc9300b00b5a8125dc9510"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "1e8409052f4c1f4b8e1944b4b085af613e5560e08524fd5f69e7628fe1524476"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a4f5ca2608441720bad60883f3bfc4e3c5f594e3e2f823740a0c51c849c33e18"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "4783f0942051a940c916e620ef24ddd9e849a7d61f39d47fd19bd5f91710521a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "3fe7a93a2753d7f1dec4e2cea9754249f94d7af6f7878d9e8c4a5034d3ee8b96"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "9ed4d6124d63b203a31edaf101af6beab333c127051e4f4e33447f9fe5fff4c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "989f301da253c7f1697c27b2d957551a5dc96d7fc55674b29fa116bb5f4e8f5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197792, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "dfd7f8325bb0bb51455c9d76c6a5f753b9777f0f079c07e03c9d1a7b92763f2d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "285bd1b47ae3b9f11d48718e5041c3280e2b4f7a77533ba8be1959d7980484d0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "93afbb7efe1d36c50a2ad15867ff7ae459d84bfe56de3fa73d5f9ef388943f95"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "cd4fab82cd6158b3d92b1e9f75993e66987d145a29e0cd34a5a21c6afd9dd712"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "2da2b356a4b730bd3185811371db7fcbff7d1fbbbafc25df6703451ef6fde0f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "cdcf6e7a0db2c355bb8acb74bbae0768a875a9fb4c2032e391ede3f080450ae0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "ffc486f2dbc4431dd96fe35b744b4734a330287dd6bd71799b8125eb28e08eb1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191680, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e96a6a179682fc743c6c5f9e29b470e8075a312a00e9195bf8ff7c3be9d7241e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e496a64241d05cfeef8647708086d42f623a96de53a7b0f441afb10772f0d7e2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "43f3be64cf2ee8b56806b1f459f57b3ff5c2c4c4801e8de92555ba6fe4038758"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "eaefd03a76e8d8b72f07d5940160dd64c93fd96d4e013be63db645b2695840b0"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2001a724fe68802082c5d9d75176c38902a4d1537a18954ee9cc8bc1a6704d17"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165024, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "9c7ce77ab32ca32cd0640a1fa2d3a25c4b7fdd226139f866276e1556a1214ac3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c0afdc5fc1e1f56ea20c5b5d3a6834a7173ac1841b92b4c8cf3352d1fdb9a664"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "9ecdcc12ca6881822dd3a9efdb469adb4ab7d813b874f7d98f69c68807b60c6d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "a17e8dcc453a17134f7f3a62c04311c688c989389a729f7beb8eacf3ce68cfca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "538c4395b0291f203afdf625bfc4de0f077092e757b5b4d29fd9dc2da4891688"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "211d0be2555d93c92720743db5a43664357f2d4550b1b062709b4af6b76a1b5a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "0b64acea7fbf24f183d4bcc24308df3ad9efc04f48f0fff67991fb85de222416"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "6d78288097d5f14e963f9c70f62ddd238a97913dde1d4d8e0caf5af19cb484eb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "a060bee6b09f776d1157141cd936ca917f839763f46be2702efb0825a64c6062"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "936ae7b86f275f7b8d71a5e7f8fa3e55e0aa6b16c754076daa97546f03f093e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8c9861ec6114d54eae8424cf81eb7aa2d5d0f748797036fc1da5e8e1855ebbf3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "86aa0c7699eed10c882dacf873ca3f82ac64e27bdd52dc23257eed3d2506a326"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b005506be94a79eb7334cc22a196655216afd7bdabce7002181daf7dbea1669c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "d497638df6a150794abfa464dce1272c6993504950baf615cf7306bc0881d172"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "2093f6ea4fb071e65ddb99962a237468ddca660237cc45ff8ac65b72c8bcb9da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "cfa969684bf3655ecd7de03cfac3df32a300f46b14387e07b331e1ae24fd112b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "90a215b9f24f82df061f2a285d8be160987ad4bf5b5cf93297e5b1ddba4fb664"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7ad670c11803915cb3e81410e3ad8af6d43c25dc97fffc0953f2c8d0233a489b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "f6e59521beac470857859d47a7277132eba4779c117e44257cb08b8086f6523d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f7ce9fa4e8aac345f2cc631458ab3f9624d0eafafe82aba58f8722eb471984fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "0d7e27f0051f4b387b96d6bbaa76774a9cd5dcdd74dff7c886c5638faac36bfe"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "cc1bc1f20915e75af4dd7f768b3e4df5555979e47ac71bc04340e21f7c6113cf"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "3ef735ab6a81895d9514a56081462330b99cffa49c3ed1439ced57a3ece0eb0d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "290d42cd7b9e340b6f8e68d79a014db0e136d651873a8eef0c03d477ff19622d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "66cf1e8fa0852f9069cb7e9d57425dc72ce857a2d61e0dcca481e5e2f4df983a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a1a5fd764f266bfdb23cca7b2f8b1d0d90fb30f3abadb969165968efd228a232"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "aae62f3e14bc22ae6c0944fc6890bcf620d97402cae2992cd38ea11a78b9bb40"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9545a67b9d0a310393e2cb9d84ed38485eba4a52476b67d8863033ac2dff4da8"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d11f019bc5bc98145cdb85a29c98c46bb03367ba2e1d83f99c147e65e3df56e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "52308c23d4ab773df8d47f163a0a02715bc05e86ab7782c28b6598bf06f64e3c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9b5e3025b208392c93d31243ba865532bf2600454b25e620675ada2376bcf4cc"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ef37e432226b683bf09b2d792e5cce47f105951a99d76f2f6c9f30ed70024954"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f5eb67a47ee0ffc3bf3fedf68641f3f67b586261efe4ee87b0b36b1270378999"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f70a90927ed5b56c4c0d6305af4077d733558177ef47aa51aa7227d268ba59bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "0e7ccf1f0fb4eedeb4eb0e7eeb2e9e7665b6c93c1bf4aa55b4f0ae02c9bf4df4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197792, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "98d9e53fabfe3dfda353dc6c6f1189e28e5e74544a483439b780e6e38f8b3991"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e451371981d99ec10007b957666845581ef1ce16ab04a4810c5fe949b0d39ce1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "218cd63f2797eeda9fd45f8cabe916dbd7b3fd3d52dfb28c24d6d8aa2e445212"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a1b06f82e3c41f4f90b1f6dda22d9cb846dc228685bfa41537797b1987d06266"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "bbe8cb31e6433a3c2e8d4c9ee4bdefdd4492dc830103e85e8bf439794232fc98"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "2da1c171dacba2488427c06d4eee49d9b297cd0ae3126f2118bd7d37eef9198a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "f694e7765661c95cbcbefd6b0fdadaa79fee161b2711f1d0246648457865ed45"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "569285b435e8144a03262778417cb2cc336f19f49206a410fd4df7a59f07cde8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "9fcc3323193a2877c85c614511a4f0633429c590105ad1ca72313e040426fe62"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ca38b61fb3e4bab503437e7e65d16c74b73943ee3fef4339abe3772367fe70ab"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191680, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3776737f49b82bba009c8f190bb3c6b0455da0d98e85e62d76bae3522a31b383"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f66f73fdb8313a9a74864f2631e8527a34fb1752d207aebdbf0822c8cd4abd8f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "377dd02e2de39a86d14c0bbdebae961a56f984b243aff3f70122fc99abac8ea8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165024, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "b772881931f7e6eb85018de5e9b48822d9615aafee7e0205bc08264e248c12db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "aa3eb676868c4c62789877a82a9412431be38aa4014c4586dd806d205b7f1f27"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "3bdf916a2acd06d499e8d34f566e2cfa6d6d2a3216af6e4e389f1b3588e2e8d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "b91cbeb8a96b47e87b3a5513f46ae9ec939a2bc6d9f5bdb48e186c9faba3c9cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "578664608faf4664fff9cbfe0a0076f98085052a170a672db70c13dc89713a76"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "703a932871a26cdac53c6a9ec58dc8532d31efa878563010f9261f5761963815"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "b7e83e308c03253024bc474e681ea4901689c9658f7e0986f7874c314edd2d51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e7b3c0e9ca7abd3f9ad3fce96c60af5262fc46f35f0555c25d85e64d859a0775"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4bff280c610bb4a0977c571395e5714788383fc0da4f6f705ec81b26e212f9b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "496c2835e1eb01aebcbdcf6d45b114c3e6a70983fdcbfac33b7074baac5f1f2e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2d6e60da111582b890435d9b44fb94ff4fcb98dc00bdd8f661143b97a9e0d128"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "097de5ab6296c665488d78a6e8c0582227086a93945726d07adbc4e7d7f41757"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "d93198f91697f85c7388d96544fb582742e13fe57864a18cb79b1c7c0487af6a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "99a9c8b5a3d8d5ab147e9b64ec0b6cbc946c17e52aa98ce5aa4e4add6d8fdd56"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "15243603fcc13db2f4458b9c30d7c98f6425fd5ac8cc982c862f68806fc38969"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fe43c68ccee09a29a28490931152633b468f7fd747ea85d784af289b28ebef80"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "9c69b08b4573f4de49d12f5beecd21a55d5926062e7451162adddd35d82e0a22"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "dfa607ca5490ae6836fbff1783dadf52feaedf2cf16c46b2855dfd01110ae07c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a47dcd9b859d55bc8862382503a1e608b1b459e324644e964cc274f526182976"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "54b0482903463a5001b308bdf927bb92a1f9a6b498db37b71599e79de4db7bc7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b81e25532cf35dbcf3495d7612c685f4a9106e1d2dcda032d7fbc2b1fbaed8d4"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "13ec97d95d644d77db7b9360c6e9c0759dbc00df98337df19243ebe3e789165e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d5b2692b03450e26cf3e18687b79163675f8f9a5ad27ea4988be85d0529d9a37"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "087c037d81dc28ba3455a3cec4b6a8f338e86ca2e701f3352a0aab709e161132"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "b4240d308737b76a4456ae6065f891262a9352ae224d5655688e6d36811e0264"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "548cdbf7d0c55b49180d495271a5d398c9d46658e5e4521d511e7be3334057db"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "eb48bbf096247c39644d0d8a23843912db6d231c35a888ba35b9475cd7d33c0e"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "85c004b8afe737af03a9da7c0f2cab6ca6980e44c12876b67371c03641a443e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "b56d99f235ebcc5c07fe5353673415a4b2712e32ee1c308bf851689324fe5b6d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "46a89c70271f25709a9486fa14cdd786d8579a281b98df3942b6cfbe9cfce83f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "cbfc7d37011fe342e8edf8b04ae2ea0f26c3997be3a3116e2ce5bc422f415b99"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "ccb4d32ff1b9b2984f8d539cf504033a993cca7c0dc0bdfd76b08614a1f423a8"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "1f8e1508f8d5be78e095ed8cb921f759b48564620b7d13466ecfeac7f4ff7512"}, @@ -4284,182 +5128,254 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "c001f23d78a8ac09225b1e5d49ae59c2e17d61099cd129dd69e86af819823b51"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "f7c3695328c1b2e5cfab2bdbe87448fcb899304d214bea56dc54a317a93b5769"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3b65280e596c808e40f13ab61f5e54f8c5c60f9915174062521e67517c9c446c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "00808638e861787dd8f26b20c5cffd0ba56bcfdfb5d3b8ad2c7f533376e2cf66"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "43be45611b028329414669c1ef50fef61f2b802a92421379cca74f1174ad7d1e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "999b7e14117d35b3863355ac0bcf18975bf5eb87ef1a6e0563daf2cb5d70e8ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c383c023f38e792c3c1a96f5642bec0ab148e2e535fac973b476daf90ed179ce"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8b82966d5b54a1486cd450386cb59008603504d736137d89c39a86c68df17767"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "7a865f92583239b4ab471cbb909d2e5291fa239e621d7fd06da65d21a30a1606"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "f3df953bddd7adb9e2e39165885a9999661be1c552b66d9981ed73a43452e875"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d00d1e7cb351fc9db76f9b426fe8cd42e816b1053c9cf8884e1b0ba22754cf03"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "8d29d2bcd3ca61cc942a593c677969cbb3ae3c589aad4b4a8caa42aa18396740"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179472, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "bef40396a0e106aae8d92260934ca083b160822609d5874201bf5623fe1b569d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1fd68cdab7619473fccaa41a660c1e2d68f38209b7b059e6a95427c0441438f1"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "a100cfa5ae722378566847f0d3bd65b7e93010012f37857f9031220cdbb0f30f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "8e72cb54efebc227cdacd5441a3a166728e0be5e882703805c6a751633d058d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "caf6b0881a1f6cab545dfefe7904c823490dd26025fc7f4779f88621ad56611f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "fde53b010679a1ca35d36a5b125518e5291cb50257b32bc67514694e537ff116"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "47fdf6e8ca22ce1c723a2920a5a1ac11c3f7731dc3dfd106c1dedd719ba11419"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a2839f55024b9505d834d8d43fb6e4286d4f7053594c145d2f55f015e0759acf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "2c8f04915501cc5b74b3ca59e67cae4d125111fcd294f1013459b7803d46d43b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "a1a8541c7719b7b67bf55549436ec2f42341bd23815ca2a75886506db151577f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "67eb4eb6513980ad0ad4c8c143844ae93b4b314405cb64a58896346d9fd38696"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c51033cf412c63debdadfecb8204f79d20d9cf8038eae59e7f824bb03b2cc245"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ba3581982deb59c85b86a891d9fe2fbc028be397f7053c50cc8114509433f806"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "472bbb35f16af8d558af1a0e1898335b0f40a5631269c7361e468c70a592abfb"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "edb70de2220a9eea6f3225fd481ceee048e81cdb565d9e04dae719d072bf76e2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "b67cc85002dfa5b5d9fa42a895058919ccb7511a4e5c36757b2f7e280f4f0472"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "c12e2d90246553b5d3650904321f81a5a73fb94510d610ca4fc84dd738707f6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "56149611c4fcb8d9d03c5604eee00ced4163bf6b412f966f3b4209c5dca6c8ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "74fe249f718dcd5a61dfb633cc5ac42a93e90f2ac15cd1becd79f7ead2a20f91"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "5c2b9fc4464eefcaafc9bcfd1aed72f17dd9d5151b86362829b834e61bdc4531"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "810bb9a7b5fe0653d2fef34a3a68e04159fb5f51467602cf3710b5baf67141a0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "4dc890e165897c12927beda1bd66ab2c3cad4df44c95f4514fa13870d1b3015a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "a74dca1fc79d3e0db0bcfbe0378d63c18dba0f2a9ec1b5cb0af8604874f21d15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7fbcfd583a193977deb080ebdeb22f53d3d9db9138eae70114fa57a2d262c76f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "066401b6f3133cf2945bf1efcbc5527e400696bdc2df5b5d5255429a88dc0ff7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ffd9b37335a8da948522056401ad5640d5e4d69ac9876225911aa11dfd476a4e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "f15b44a45d1d7a8078676334d36bfb621e533e57d5f7d686770f82cbdd65863c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "91785d4b7a8c64a30576302cd011b4ef756f7b57d37d97a47fa8adb7ee47d1e8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "fc1234fc2944da5dddd4eaaca2e73076b39d621c0df4658d232a72f67aec08f7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "921d420efd6f16cf49dbb2fa62c83e32832a74ed25fe466153a2265c7b5b29e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "06b90adc53adff3f87682d83ddf073fd5e18e98e6320703915a156b57e5f04ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "9eb138eee33d5f6a4c38e1213e00ebebfb49737048977b226b135a014025082b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c80fd7348339b0c114a418d388796af0a690540f75e016f41616a7625d9ad5f8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "147400a03b73e613f26e6d7a9cf492b8b4f3db305c239b5bc9fb90c1eb868bc8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "0fa041a06ef85b02eab81202e84b21df79a877fab3a03cd41af7c9fd16459a98"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e4d955c736f74fbc7c23dfd0be6eabc190ab4809b5475eca59182eeb411b9931"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "bde4a112c02c6bb28ddd5badde39a1f6cc63d5b29ddbc62775dac1b0f9db2c25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "0aad7e1e82ce993d410ad51a7aa9d75fbeadde6d85de94fbf5e9aff778e383c2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "626cc32c4bf1f0edfe6e1b9ff3f6019534bdf6f7129522b5d1ed9fb9561bc0fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "94067b3f041eff8f25c2be1d5a641fdb1faf39436c5ff66e5e343cd8824880b6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "a3befbb9ffd28f987f687f7dd032109e34ceff313ef84b3ced6b4021477799a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "044dcf2e074ff4ef58720cd01e3d2356c12170dfcf74abb951672c1b5df2d70d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "8879847961f7ed91c7ca43d3216a3e487c96ccb4f8785dd6e40401faeb5f9f9c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b3c4e167411b6cef88ba3031f15d23d3f82f49fed117f551ee25d734a9491d02"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "c5efadee6e5626ba33ab0171405b0d455c1f0f84889ee9a6119eaee930eda283"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "04661289df9031920fddef1552d51e05bfa0bb24f578301a4ec9e9cf58baf998"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fea97bffe36a53cb0e685bbc264d1b7b19a05a37eddba335da044e3802d5f239"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "673ed15dfb65cadc03c62342ef063407224cac6fb5b5768781b3e0b9759172ca"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "496b1615f2b38050829710842f1386659b60619ef7b0733bbbeae2527f7cebd3"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "47976282391d2e4b9dc541790b317d6b7b5ca9a1d101e6db979873962851836d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "bfa34b655ff5f9a678f5bab1be509cbec7d5ac9c2696aa352ec1967bbf19a995"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "dd5ed21705d8e9fb7322540a84ddb3e86fe7c8c363f3993ce1b0b1307be644db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "e4b976ff703ade5019de1faeb93f1d27144cd58bb97dd3357550b221f9f38998"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "a654605f56a3ef734b160370fe2592d9a92eb2e4ec04cc4ef1205338b0516ca7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "73d07e6bebf378020236c0dec8dc574b9c96969cc7400db59817e12742013ad2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "0a8b3df72c44edf8fbf2d28ac49a4c52afa4511728feaa85d9a5648756e9bb5c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "b7298450fbc3d878819e37a664b530605adeae08040caa847e8ee5b21af830a6"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8a4884661f6a2533e2aa249e7b7bf25bd5d763886ea16f116c6eb62df69b27de"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "42106469431bac234e5ac08603aebfd1c53d887a539f0c539022bbd54f1c3ebf"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "a4074a11941627816e4bc159e89aa9f3625b69f25ccd9923830be8075eeb22e3"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "390aa3568923bceb5da217e56df9b3023dc1542cebeebcbcd532418211e4b6d0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "7c29a910da3e09a4c8833d9e1e63fd7853bcfab634300818ef4af996339dccb5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "6fbdf9af79801a4c06fce0847d06ec3b736184cf0679edabf8592f083160080c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "328a3401b8cd55662d5f1bd53431a947cad6ae47fa589d311e6ad62f4d19cd38"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "2b4d1b10b83055d26d946f166f1c01dc8c59cefd5a55f71376eceab1f2c5f35b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "8079ac282a2184132f1e37e86c7788179cd75db7f3673c8b9c221179f2503f7a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "10e071e2421dc7c4566033fff53cc97944f706cfef90c5728d649066f2de37b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "de55b204e744e89be8a2ea77a996a2e82f4b97082ee88bf3b60e1424424eaf0a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7a2bfb30076bcc8d96bece4c9cda55333a452992faca891b36bc5da74dbed67b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "19b4f6c31652d61ed42f6c8682da4f17b107df1ebd077381543d841366c41b3d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "a2ad71c0f7ca98086f2cfff40cf9fbfa67f45918c31fc6f8dbf5c55b378b3409"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "bac39038ea03f969574ac1fbb234de485947fd5f854ac85cc3459c56547fa553"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "06ea7fec13c189faf4d52961a300ef06d93cb9ef1d19a7be7bdbf64ba0bbc39f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c40bef75c346c73bee24690de450d558cae007d4a0f7d82c9e586493d6ed8f1b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179472, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "eb6bc68988eec7bc660b39fbb33cfb0a22ebc0bc955c73fa9669eca27d806854"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d8bd9bbc61a0112520dfc085100f78439e4478a7db5174ea8e18dee33422f3e9"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "14415864d3b3dce749cd98caf67c289564d50bd7dc96981ffae52388b6bfe0ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7d6492fe1722b09a12ea1fc7024c1ac3ae907370de13a9d011d59d516a6b40b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "91044936447b16f0e74fa2eb83943e87bf292ae680ac838cd5b85687474d9c51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "22157517280fa657b50160d828b28eb928af5bc1de1f5ed4e37358b88da2afbe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "96e47f915175390225215afa271d599f64784acc8c836d80771f4e4c2ab5ee97"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "5d66545f31b72ddb7d92aea61ecda58af8bffbdf8235de8abb0da9f77802c25f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "973b30717be1e1cc423caaf1e4b405ebd6cc1a1da64ebc4a2cdb8fec21359032"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "6909196f0aa9da495f9c8cef0d2ece499b08b57059b07b60f2b2f8624a1a918d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "b014c384d263bd97fa335ec9125049c25879ed7360ea896f550847c69b1a53f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "51f894922057a8bb519d914278d2e9954248573c1d1fb02d380993231804cc26"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "9005fde8e356f64ffa0a06b2e532071e4291570c72057620962875c539c9ff52"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2db5c973e82838f9b34b49b01fa4384873d4aa7a6978e13f96abf9d8d22ea0c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "6412a16e7f478c563491d1429734060cfadcb83e165f676ddab6e063449884d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "252e1ad4bdd81175572c03f8e9bd022c473855aa807b732bcc00f4cc781d2325"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "22e0e752bcde0a7cafdae066f3e2f60d82630cbc52b522096ef4a0c44e4c2d94"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "3d5383ac1701e294f9bf82510ad9802bbd8e14152e2292a03ce20f3dde801a20"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fe0ecc8298fdc2067a1e22e6f2cfc274d81dfdb48c17234f5242014dd0eaa81a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ec6058153153eb6640796525683ab57b130354e3758fd0c8fe12eda4fc0c1414"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "86cef547df37a5b8e38a7958a9d64bc18560f3373e00297d1d3113e8f088e7ec"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "f2329541a64c0408fca37a97965736689507d45a392ec20a3df285ee257c12f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "1a253a3ed19aa635c356872730667b31a7c4099294a0d66b4ede25335fab9903"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "e53d6bbab27a77e7e84159e54f24bd3f6a001051ced237d1f732978852f40cdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "ea4cbe492422f159f9e65cc8f1dc1cfeb4e62bc7b615b3ad89964bf6788d1606"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "64b3c5b60ff7a272dfba27a2b0ef3124cfcdf705a1c92f68c01e342d1be65052"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f93e82b08c9f082679e8d3c6a0fab93d3a36243ab7a055ec0e4a7b511e6fd543"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "22f002be5dcb09c2a94dbc5cfd1dd6d4627c3910cf5dfcdf8e0c5c0ff2bd887f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "43915e19973bd13c0223eb8a39356965ec13b5501a77ee72be32e8109087d033"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2036e38d222cf6f705050922430bfc6c44e79ac9d47b226481ac6af9c12d4847"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "0224b8ae90981ed1d5fd060c2e561c7fde37879876094d69ca6fc86fea3500a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "27beb83556819a4fe1e338f8274fb5224a0c9ae4e603673741501f1d022f55af"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "75a5e5099dbbe55b0994f4592793832fa241d1af493366bbd508abbd56c0b0bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b680359d9d7aaecc087224b0887b2490c7e994fa5b7b7788db42008a5fda0884"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "17afa7106ed36489e6f46fa4be6b5a4dea53d0c727758d5132d91631928bff0b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "104c89675dac23bae4caa1f7c1c3e43903881f20cab4c1e75922e9c912ed468c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "e3ccbc997aea63bcf6a7cdf22f43e1e10dcc017be0468b8a4e62d10309e20045"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2289cb7fd6daa3fe6d13e99b04037921fc74d54808468bf1450b826d7732286c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "62412887f48aef9c65ece54fb4615b4df155505b2e1980976b21f711ee0a78e9"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "080266dfac39ca760d6b6f1a216251c14dd5e10127e7bc45b028674eb11db2b1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "191d8aaacf3f84ceda040a6f15888d45763a145113a388ca4c3ea965dacb44d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "0738c5b3e5e5d3204c05df2542479ec1c5708c69210043ee0d3ac34b968488e8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "2b2e694cd84a5f34eb6bddf26f556ea1a238b794b45bfb8b1b69e5eb8c3bf3b4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "1c1fb33822ef3d29601e90e95be9e9007e5fc5f48f4662405e3e4f802ebfd513"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "dd8fd244529335ec17a64bd4f1152bec236af15e3c0848c485a903a5730fcef0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d93231797e341d4aaf0bc0b683821f268cef55d6e4bb3e7de8308816e702366d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "bcdc396b10c02609e2f82666cdbf5c11a99da9be771558848f65f23edd6808a7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "1b1a54fbb6bc3028321a107b9ae3767eb78b5bda5ea69da8a85e6a945f671be4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6fd61cfb79bf0d34073efaf09327eb1a591d28ec16871fd8a53e9bf9a01f7f8f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "79636c04790512c520df59bdf71c4bba10458b02f6489b0625a4d7f7015456aa"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5d2898527388b081e1b680d3d44948acf758a1fd44873417b57ea3b8c7835a87"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "7f3372f20353e84cc08265d9068c5a035652c89caa780752a4bb328d6a5e89c2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "9c0f60fecdb92668e17cfe2bb0dede8d5b3a33773659f63bba813cc185dbf51e"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "7a7a3b31ff0fa14e358e5fbfecce46fd73c9370c69f4a9576c27990c94429089"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5b412b792ff6654186c1930c0b7ee4e5d29b32a23dae1d46c284e716d98e0da2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "6b84cf8db715aa2401d93daca2b8a5f7611d230838f79c2be08358e1bf0a2e37"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "94dcf9cd77b2a9cb9921766ca5120ebadb136a8a3f67e598b329362ce220463c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 194744, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "fb0360b57a7b02de73f4d77c781d7994282892ef742d7930ae9157ab9e64008b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 207112, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1313ca53f6600cbce7796777efa23666ad552725de904b8a064d0f6e9ae93774"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b3b7a134b8e039f5f3eb8f2ff2bf99ede0a93d5f1deff70d8244035134049a09"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195208, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "85c947eeca44c1ad63b8f8051946c1b2905463a5d34aac33869767f53ba873d1"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "62382ea609995e843cf0d6a95a9a8a777718b6eaa86bd2da897785d9fc952afc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "9797fcdb8e7e1973cdcb9e06a2e7439c87af4f3898dda223ffda026b59b4bc6d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8270568df7911e3a404601f20ccce6235ad3af9c1b1e288031d6c51cd57cfa2b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2f884634c22d8dfda1f518d3a6c03aabbef25cefd6e50d62fe54ef5b62b6204a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "6d1a518794a7677aae46e36184b221019db741e3044aa90cd7dd6199fd53f401"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "7607c443bc336658fdfa18478baf3404e626656444169d437b6f3eda9b8403db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "bf8fb2c7bf01fee102ec8c251cc27fa284a15706b96441bd1cff7ba954ce7c6d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7ece23b2f61b554fa89fc37c429819db1b38a3351642d5d654094b755c9c450f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b825a1ed7def84ca9f1a4cd0ac9073858bede590311dcdb1fc7ddb555cf6040d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "e42d7ce3c9dd23f6ed9af11785b3c026b97902b2954043c8e708ab4976ae74b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "fbd026ebd60396249451fbba209650700094ed62db80c15bdfe38b42189f3422"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5a81ebcbe72a842d4999329df11ac703c88719ce7571da8bab602b69eabd8dd6"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7908b919c4de295333512382a4ef8ca73c4950509cfdc718091dfb4b38c03c25"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "c730c29fe3dce6dbe0927dd4f6ea2b33c4afa2b5c2b61fe681a573f61a636000"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "c16547ef2c205998ab041ebc4e7fa0aca3c4cf3503dab13a7d46aa088f016e49"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5a61272ece58830fed7bb8f05ceac9b0c41a58687c82cd75ad8c647f0664b91c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "f90b81ca24d282c6f09321fbca1f5bccd80b764308bc24360f464cfdfd592661"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "8baebd16e53bb118ae04fd2a2e409e64b1471de64d360f8c6c6a5eb0f06ec7ba"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "7b4dd99edc0c8d38d08f353d9ff8f020e4ac84f08a9efaf29f95ac6508eb44d9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "aeb9ef47cdbf306ec199beecf31863ebb72761e32b02e376b803f07b6c4cb7e4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5c840c8c25d9f53d90d86107e3bdace6112c3c5b5594b857c85ca735da718d8c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "ddcc3f65d03f409e9a846af9cc07373f490469e15348929d2c871c2ed79571b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "132f1949775e060d28ec5cae2e0c114e22922733416bb5fa4720b29b0dec278d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "76e4311619f5f53c96757f49286dd0134678e41a2320b05ded9babf5715b30f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "6747141ea83456b06779891c2ddc2b9e5995ab7c606774e5de277e9e46e495dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "b709b396f1b8b522ee0989cdcce81c6013887c88d94c392adf06bdd5a7e6f29b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "0ff28e960d09228d7d6cbc1ac5156b0d7dbca8a97e8fef29c9ae4599dd9946bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "42b1c3cacff21f8ddd56431adfe45ba655c9b0a65e6ba14826094cf347ee0efc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "d19c221675304eb42bd4a4b349706336bfb71e39cf42516a31667d31f6235ce6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "cd8518faa394224eb08c3ec5760e28b19bf108e0bc9ac80b9ce8c5c811443724"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1bd58103d19c4dc9c171aa6e9eab7d0e44d7aeff4a8dbc21baf108b98529dcd3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "ebc2f41181e73ae7c18c0af3a5e286b19786d844ec656d9c37ff49f3298238d8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "b4770c43d1288d5e0b5189ad0f5360c54313cc25354af865d2cadc22611becea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d06bb4b8181156f31e2b17ef019f8843125a1b384c35cd4a13bcb11f64825023"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "cc5292801cf3402de8fc2505bd51fd9f6f65f3f7bc670f5c03998b94ceddcd00"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "605326e4a2b1c35990b0d17bd98b297e6cd6babc388d9de7e9ce2eccb33e410b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7801d21237ad5a1cae324a59c6ca41af6af84aee7596f6b0dfcbaf1d38c2f317"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "584cea39a27b3c3243efb94c16ba22d308f6d44acc044e66c95276bebed52ddc"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "3f185f7d9d6f35e527b6f682d776bde4ba955b91dcb282076879596e7d0839b4"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "2bbb46e3ccf6bed2d3864449042be8a4667b787c5e0fa79cbba514661b82c851"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "b8debf4217dec81cfb3695bdeac49f39cd7862f510f4d1bec293094309b0c5f0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "2e2d48a2428cbdcfc92c3d08e59d4f362179856f4d783694711e1849ceb101bf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "edbc0c58f66f887cfea9785935d5adcd0e11dc82b5dba4dc949282462a0a169c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "ef1673126f97feda6fcb8f3c557a2605451b869c50ac795642214625c26dbe49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "ff023e46b04c84e1a2efa1e393247776f0a8442c0841fd559365596db5829389"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "386137e3a53a678cfbbac263f6b6124a273a07e796495a34f1646c995dc4d05f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "5101b91e2c2571554ee057e176e31d9d42094af4e94895d47e4e464daef9f87d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "3a2a20c14667b98e6255c219a0ea31acc977270f1c14cc282d3ef2d06e4a517b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "fa85eb52e2d7bbed28d84c46ad599674dd2a09c654a4facf4b27271cd5d505d4"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "8ee8c5a475661b91d6be7267fd5aecac133494ba59b02016a643b0b7a1c8e767"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "02c302f471218c861499a78191b06e0c8c86174846d5c0ddb27de2f23b6dc2fd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 194232, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "7c7febd08de08a94e7cfb2954af1fc05d6ba8a585943153be99f9c5fa3db1ece"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 206600, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc152a55b2afbb2a39ae49a7d97e7e327468b8e4ea38ee8e682877bb0f30c08e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180280, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d6c5af0bf4994f5132c639ddd06f135e761c4157838e7f7e5d54424f6a221a3c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 194696, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1d7d16470ede90e5aed84c360a245ed29ce00e631774ffb263e2ef8b1c39a9ad"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, false, "c4c5ab9fb8cbe9234876c37481d298112ea784e649bb9780f4d28c61c25c88ca"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "41e755de7fe1e71ac38dec5bff76e3aaa4d6dfe6cbe0bb3639724da6ed1254d2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, true, "9665fe28ea38105bd0c8da0216f70f60c49efe42d3fbc9641f5cbea6f63d7abb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "75e199e07c181de2e72c90b418343c24772964ee1ce54834738f72d770092b51"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "c42b2c1674c9ef5ebaac53adb031aa0ab0f3a61d7549f7828fe7a5b0ac8a424d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "3a19628c75aa54a364d915cfa09808173e61703ff125d5d215a6d9e335bce360"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "0dccfa2739d490d1eabe9c4219a7ff41da31cff89f69cf9f6c8d7cf7a2e1d4f2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "79a0ab354a360a26eb4c9a67c55ff054f631df2adc8b254379207b715e5c5658"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5c59cff7ff499719643692765b81185cbf1267fe5eb67dbec6f169b5c74912c3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "ef5ee83338c01c2ccb65983930d7a168c029782b08f0cae45fc3bbbe7974daab"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "0c54f792432de07b84f652bb5cffc569c16f7d81602ee15a4388ab92d0ac06c7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "87ec3fef7316bf15eecd99b510665bec9835123e4b7fc69f31d01d8f948c9d66"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "309db09766b122123d1eead7782f8f1c98d7fbceee9cd6671f5b5612a0c2f66f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4e462514e0c89ddaa624a0f9a2853dcc491ff8f13457d9b46edfed1be14c97a4"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "189b1514e6b71c557ae077efb299ecddbcce16c6985902083489ea660ead7c8d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "98b91778fdbacf080ca76edddedb8579f3f866a0fa7e721aa592128b20434224"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a2b1b0958eb88a3cb8477cf1efa68c2a2c6ac547e201b54dfd600f70e375132c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "804b3bfbd27948ffb162409da0a7602dd0204ef9936bb994880896cb7e782d74"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "6499b8313c28936bb441feb0701e9d90b39024e50a81426444f051047dcf8586"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "f6cdc29b4beed32f24b467dc1e8316a5d9adabd88631b384db3757bb964cbda1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 224040, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "8dd2bf316325a953d1f62a484d4e8223a7a04fe4a64bfd11bf0aa77bf2a823b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "a6d2b57976e31f3cfd109072487f900ab5e8e4f2610e52ca48dc94c6055c7a2b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, false, "80ef64c5e2ea4dd1f344cb1c084df48c8b6c10d90b10b9e6ec6157f073ea6516"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "e51c59ff8b38c1e28f8409c60a1a91e45d330cd7f642edf328645d99194ee424"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "3fe3e7d460b3ee88f62582c6314ce8a40df98d2bb10b0a24ef3e8e7e8ef0b972"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "0337193790be7a9425ebbcac0bc7ef4057a753969dc8841251203776a11397c1"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "19a4dcf9b4d88fc573b4e24b3fa671f48f8db448c51e782bf8809b502385a991"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f0fd02ab281c6882d1ed985823b69e5095775ea6fcbb078de28caa354cc8ff71"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "050f3a2b0c0f6209470fb5106d08099e29d799b144a95ced6c385b30a4b304f3"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "734eb113fed941f5294f004191d7f4bed651c87bddb74116e55ee42432b8ad29"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "c9ea3f3f3f682ed33d956e3162d60d91a7bc03592f43237bd067308a99c7cd75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "179234022192ab60aad40ca78f25aaf1b3165511215afbe3654d62df27f056a2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b070a0f563fffbbd3193320b243cbd764ccc0bf6c5093b591676a1339d24ce8f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "caa34b58a7004cbc57c0704b104ea69a8c4c1faa0194444a64d17fd9909c340d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen", 224056, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, true, "4891c22d31fb5186ce805efb994c19f75085bf0d76fda7c14ee915866b41b082"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "c2956e2cca2afb1a4d92ab75d02c52d7defbc6d6cc216c6ad013cc7941c57400"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, true, "c1bd927a9fdcca2200963c1c90389efecfe1db050c2e80b7affe43299d143b51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "5e13e6b2f3a46a983bd270ca0d22120dc655f1ccb51dc919b45b1c9312ea8d67"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "a2e588ca50c420a36d984ffadd6da0c786d5afac1432953290f9dda869e311a1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9a901e78d49e36820f88832a1167df43f3c3b361d2da37b7f3d1afe96c1019ea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5d4e2de49928ca34f77f60aefac44955fa3072bac250baddcdc5027880539967"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "35d2ec3c5eca0453db563af25307cb5dbdc07e3f5aea42fdb4b6af3c0ae61044"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "5e4d450a8357b2fddb62c463cdca78964bc2b28d044b41cac2da197fe2c69533"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "a5310fa5b923590cc052a4fafe488152924ba64cf48423c0f15e887191d7ae41"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 1, 3, 0, 2, true, false, false, true, true, false, "da9236e90508408c161cf61611411a19246511b32e25a7cfe65ae920c0edcb73"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "234ce169292ce8ef7e3f785144e5653f86bc129a04b29ede414750df5f9c27bb"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "8e50a1eafcd20cc7ac84020812e070465de7225a54849c95c67c47e0b6e605dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "9dada75551a5ca5ea670606dbdac4579c9ef58f893b7244821c33c9b5cb3d11f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "d730b19f453c78c3d28dc177963ab6d3ffab79df02b2f6ed9d04262c252ada47"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "c6a1fd526ed818c5da48e25f8a68395b0d89b6e0d563c96bb6efc213b2aa987c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "3ac0aa68a5d26c06282cb88cb714763f445b2e6af76bc0c0d1053a0e888b9082"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229416, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "d7c69a6c6355a6efe76012d35d387930b718230d481bc435adf50602dcca2a9a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 1, 3, 0, 0, true, false, false, true, true, false, "28aead4bce95a2f5b95c36848ad5f3399eee9c99252ff4d3605bae0eda58bae8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "e9ae5a609d77fe215ae333a43510b19b62ea8f1d1bd526d1db81be9437a7e6b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "4c8e34e5ec6aa359c7ded813b1451ab9f9036d9fea80bfb8e7fb98304fd73683"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "b12bf763710d2e21707b290734a8ed5d8b600831f098d92b72b00a2495beea59"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "657649d448bf1ab1f5e615727d81195be97782c6dcc05084b75be369c9104ded"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4dd05004d1391b3c8613f75c9115f26e64b0bb247195c855963ea97a33e727c7"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "9619b990bffb91ff43d2dd51df3f999e3cc547fb0526f8083860c70c4bc659e3"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "dcb3566556e8ad24c0ff09e15ccd9553af1435d7a5f6753ad5bcf76f09af9824"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 193976, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b153c5f869c6f0ec5f02fe2dd9ce64f31f692fe5955f32d628d9ba8f234e5847"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 206344, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "5ff2b043c4b9f643ebbee66b96ace430f6df5a7190c93e4b876412ee9d84b695"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 180024, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "9eb642a470c177296dab10a6f73f0bfb686fc8d785b668c60108afeec94d1679"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 194440, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "e10ebebbfb674534b3852e5ce1a5d2c515b91e42168d158e7e0860db7c7b2d6b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "46c30ba02aa3d1c7a1dd22c1f6ccfd5be9e99e9a616c498f268b805b5cb766fd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "e03931e12a5a71665de627e931edff2f7de9c4f4c915554f1160513ce548fd81"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "4261bdc11c02cdbfb19cb6908f2c74455d5314ed268517d567d894f1b15673ad"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "56095790bdbf7d619656d4b2c6247bb5082c931fa0c710ea3422f02b69d862ee"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "daf2598a7f6b0316ac9dffbe6b838cc459a00b2f6007a7ac387ec20a6f7e7033"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "377623435c693cab4dae627f7f8c28ea4bfa7b9ce0f05ebe67c0fa6e04628f69"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a143395862bb5d81d6c020c847611b3128c8a99e268880c5af1897a34354cf5b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "75139ec9a43e6d374c5aaad1c54df90267dadc0172f78ff2cd83e75166491fde"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7e59f9802c65aa956d901aa15aada406c1e85b7535b1bc0b6ab88a7c82193602"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9a196f0a0f291c72e1bacb3f84a66eadad4ff01de03f8aa383aa8f37e8f18236"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7f82327167f66f08f1bfd5c0547255ae5c47911a6c3eed31fc69ffaa136b044d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "6afd36a748adb8d55a7b8e118918ff8d2b8b23958eec159d3845539b0335fe4a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "28efc69351348dc1ddb7464ae63865f06a399cdfb3a2e2c822fa94ff469d1519"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "135e9733d40120c20e2008a1906dfb816885559c331f1ca1cdc36837afd68ed0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194912, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "033c0bc92970eb37257878be314832d600e00595482bff3530c018aef21fe9e5"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "552a600d83cf74d09f74189dd0bdc83ad326de6c96775767b1585d95897e8842"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "074886ca40756dadd09b899c0b4f0aa9a2d85a474b5a9e8ac75054020bd26cc0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 206064, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "6d6274551e59236338bedba310b64b96380011a476744d22caecadc82ac4eb35"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "d906be31c72afd3649af365ccb1598761709241a95539ae7779cf3198cb32692"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4ac0c453950704c662d3d2e9774936c3f8bc70bb9cc68420cbf0d02ea78fa422"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a74b1e416ee08bee596efca8e43f86959c311a1f702f767a843fd4b6dba0ef88"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "54787f41224fc993178a07d8d30576d48197f01b60987ff53ff5ff0a0ede6aa4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "dea5b93bf4dc39e60a3e4060fb29a3cd0f25f858a37e25563adfe6d5af427530"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171872, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "38fa4017c014ae5c6d8abf70e597bea5197670e3ada102077799bd6a4623ef4f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a00b832a64e95b4101c7df89dc481f1553b7ee943b4515e19513aea7e06d0c6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "60ab9e88b53c3fb6ae612dd5e8ac59f475c31802f9c3148311fd49f479710cf0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 160944, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9b6b8933d4533151c8f4c27ab72c82eaf64dbedbf5a8a896017e7ea877817dd9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1b150f001bdef69daf77f9c1e7e929f3b6ccdb02c24a890109c4f088ce9a258f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 173312, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "14efa252d5e78d94a495e7f340627f41d58b396763d0f5777ecc4d07128397eb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 206080, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "e1268df62f894e8a76ce4bbb73210cbcb0154e1a24cb41b89865ac8ddd4b5784"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "4b2b81d443c8acf0096f829dfb599b176d4e691c9a10e70481efd59a1a2ac2ec"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 149216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "85e1fe2c09f330033773b9cc03020db3518a51a40c8aa507ccaacf4b1e145ace"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 146992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b244d385d94e013d489348bcb103d50730f61d4eda9f96bcbfe1d0c754c79582"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163632, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e783e8021b6946e46755b26fbc3957cbf3446b02672f42923bafc7cbf89767be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 161408, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "04b888bd03f6c0143badf75d526b41f61757e26f42bedffac76401add21b72ee"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "bc99400a58844616123a4111bb51d8923bd6e94ab5ac8d05a8d9a86524d5ed03"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "26f1b34cddcf6052271197f96edc034743b4c33d241006746ed6c3d1d1d82380"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "b76587ac9e3f6cdb841ebef596a2da8f1e5cfcf0eaa4b6a481bdf1121a84292c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "6bd87f30ae281ae871ae80bc3c32d89dc183f6a1d84e37fb604e3a5b49094304"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "67e574a1175b04f6d21cf797877eab3d67d703e20f216e57c7db4f1bf4d134d1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169200, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "0c487b29c84d336ec83dc5678fab9010aa7ce84d7e955282d606071db5581e6a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "8da0a5187065408081f89583aa5411bb9c7e74c7166c62488d0ca0043bde049e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "830511f2153fec310ead1e3a464f363ddf6e517f5a650d797dbb9e37b58d4bff"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153456, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "386d09e6d6b90d8470dcbd27a9674cb0c0fc03fc4bbbe794d54d57fa0163a4a6"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "310848bbde309c7361c304b3c2b63ba56fc1abc884db189dece7ee2791f0866f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7b7963270df891fe0754a3d97e5c0fd801c9b327ab79e71ebe0090b84ef06a57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "8286823bda167201b520e3ebc972fba664189f4fb2b24331cbcc63d0527dc825"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "12784806d4897097f720805cfea0301a79f1c545e5eb6c0d7386d71c799cc8ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "dc9febc312ec463906a447ba74bb4a1be03f377e89e7ceb1e7daf9930673d144"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4c58379052ad2e52330ffe3461cfcc03b75fdfd67e5eb46bd73b87c82ae78efa"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "3e91a462ad16288a7424933032d6abba9043448fc02f926a2036c7ad104295d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "1f65f188c9e3e420226474f11fbc27128699bef1c4e8b96b48ac13683fce1955"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ad8bab4fc9e40a9fa9380bebbbf6d469cef4a9c6a460b28d4334c3e911fbcfa7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "49cdd07ab1ed67067e34d143ba69b8e13ba0645bf736c76c27c9777118c0cb4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a515f2d8410759cb316ec4adac58754ced16c5b470207395bb65b0d5d0e4a0c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b410ad81ea741ea4f5549fb63704d3f0c8fe6e1d07e427b8f770e25c2c592bae"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0173a77d8238c299b9a72eb3b8ea815997544a605b473a3ea5e07dd52116c694"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "0cec4f74fcebb8e5fa15cc9c8af059aa682c40ae9937658bd7f9c8ea724892b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6eb98b372ad587a74a8881b076bed55ea80c62f33007af8ae1f85bb6f9477408"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "c4160df0411daf40eb331e295574800d7673b9b905358ac7da50b4315608b968"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "c089ead60cfcfd38a26f5e2aa4479eea197f47d804f6ac35d304105299d2f17c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "cc2a5375a6140c1c606dbd49ce32513d8ecfd0ce4cf250137b6a8f4226b05b5d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9b22457abe08709faaf2cf62869b91f7152247f1956b48326523d76c48fcd7d9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "a9a5f9d18c2e8ee080daddc334d59bbd33afcdd8465833f65aaf5d8f0dda3d29"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "aeec086cb15ad95eb3d991a44c7857f150f4ec24b1d480a68ca75bebf1f867ac"}, @@ -4471,135 +5387,163 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a8838bfdb2bb5c4ef675dd998f65ab98ef9bc5da6b5cc24a3c63b40b783df7b5"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "e726397c34ac81fe7d3ef3b639869f1a5fd6366f6fef37b5908a8be32c200fe0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "7327019721c8efda13662257de71e9f46acd60656782db76ca39dc1973660f35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d9622196b4052204c334f2ce6a859b6f5d4cc2a3b8a57d9c088d77f15d3d8b49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "550382e05637c199d753bd1edfa0f12525c57d76b0706c7fbc8c029fbc558049"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b98440334f08110ce9e8c7fa08d7ad8edbf295fd118bac9db18225f8377fb8f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "a592c5d1e202cf78f1f58eacb6e52c2c549927101119b1e8f7937e1965dc4d5d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "93199886007c7e7b228a0928ddbadfb75a6ce87e12240b9e78d115ac0f24696e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "43b45f995cbf0a9151a2fbda1f1f70a3c4375533b37ac0b9e85bb1393539baa3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "068e0eb75d8a4066311723e9273ea640cbdb4f1dea39b9490dce3ad6dc2ff23d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ea6af9febcbfc6213696ab612d9158a494c1aea6212d2b4f571f67616e0c75c4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "f49153033e0bc19f9acf1a0f6d2c6e6700635e3226f70e9fba0ebe282b6e7d6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6fef9c5de481eb3fff9e96891aa972960cea7bee8fa9f779bb9a1411fa9671fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e1ced316c294d354c0ed40e0f4c646b90eb2194ec5be40d467985dbcacf96c58"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4a9e7fe1d0f28af4d094130474b0aec9a089237ac1225f28a1938b71405de037"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "4fc731c35192cb9db81355528c4cd1a6767ac19f6fbdc278f6e670cf9f1cfcc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "ac8fdd7c1d90479f7b5996895354dc67d7488007601c5ed9eeecc5417c5bfd30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "86e63be1f1124a6eba23fa52278b0e0a7947869173ba417e13df75bcde992f9c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "1ac4dc718915fa07afe1623484bac5d8cf6ed5318aa73c6f6471735b5f978887"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "2174edd4f711a0fa66fd2064e4e2f20cc2259dce92c4983b4294825aa580dd67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "789d262e23c1b40f27fe6066a911c92695f136cbb4c4363f7d4fdceff4a5ebae"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e3b2e8d11bb62d069b2849950955b590f0f22b2e6b5f6689901bc758369c8359"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "1b65bff1346a558065a7cdf5324ca5f8c52b154ea8631fa8389fb037af76bb45"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "391c75558653a628773f895b6376fad3d6aefbb0ab0ce46bf160932734432325"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "36fd82322761d686f48b2ca908c668f5d58b930107550e6d395ea3826b7d0713"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "9d7b9a5042592f5f3fcdeb4ad60beace41121685e757546d502a7806b40e4c02"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "e6f78e603f92b26c8013b0cce8204cee7c72b3fc3390ef95bf3dcdb4f4b4832e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "7f67979403e3ed72f1c18eef4aa07a409dee97f98584e92b81a942093ad61f89"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "0c11cde823c38b0cff2488fdd1f7329afa99bcc7d02ddfc9788a063df9324c50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "d88bab9503d67382328c7f14e6c50a8ba9396b1d9dd8f7fad5df9d51a0a6b658"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "1d81abbe8a85ce963f73c098fd6626fb707b490c07cf0bcad1ac03d819116a6b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "fcf1a24599fd158762f30f8908300b1ede5a6beadfdaa3ae56c0a7955d94691f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "b28216373592d6f66aba018ba8b30ef1b2f3748c85922eed97e6fd7b46e46b65"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b15b2f78c12f5931b692ad54b8ca07ddb1c06aae897ce69fe1ba4de3dc83da2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b9ca25069db80478aeef153cd6d6b33cb73c8b4bfb0420d3c33144d8b792bd21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bae3bbb2275464f21034bf651d457864ba1d5cf8e1fea8046de3d5f5e0f25494"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "6f258e052580907bdc1c73ae850cbaa118474c62adc3c94bb9dc019175f1416a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "10dfaf24c42e5ccc246496db971e5166036c63d145f25dcc49b5244bb84b92f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "049f51463da3cae2d1f006ae0ec1180bc496e7ff0fb37b884ccdec07af5d02dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "97cc54891041d4a071694b9cd212f5382ecb69cf34a67daec5f63ed04e944337"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "139bbf203ef71ca3df2dfbd96bcacce8d6ff20f955cd507e2146ba0c0408e0a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "3ee9483f4fff391e61a7ab96c73a461c91cf32ea65fc22245f930c91d9a01544"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc22c56256999e97dfdd6f34470406a013b2ad7a9e4b413e7f9b3bb800212091"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "39631729e761d5d4c79258c73014a89a1143a37b576cb71ef6bedcd613eb5c39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "33ba7241eb630b96c9e2bc1423068b7ce8c94a48863190cb2bff86e739753cc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a6863bd263952e26cb9dcb9e81720316bf2e34fa6f764f4aa07291ff27058f85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fbb8c56365be45cdf42e41169d00ff24c6770b8ed1570bd65306d6fbe7e370fc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "1352d7c54ff51a6779a208a1d3163967c9ac62bf31d1f1491d38b3341de91f8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "02edb07eb61dfc0412e7deeb4cc5f485a8f605a49ac6393637dcfe70f3e1401f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d228c07b5470f1ec265a6e8a7d30909bee8837d5db3ff3c845af2a5af2504db1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "947decc41cc8dd533fecfa28a84d3faa730e2fc555802729cb96ef74acc8fe8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "15e635ee0400bd24b931eaac44965dd14b9d73be6f049b8ce7d9993f0017c6f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1102357342d65c0a30f39cd42da7c2ceb1f466d2c808d36cdf2f7f3eb5da831c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d0f94351129d176cc6a17409ab7397da80533aca026337030cbed05ca8f36362"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9188957c50b649d7f3de17d7ecbb9c96d25eebcc4c772330cf71d92aa1e19176"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9fc3b0db752d39cf57eac04eee839d68fb6f301401a65919bec71d54e8bba05d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "efe087188696d4eaef8f864f793b2b6a6593540a66703d0d410032f254e478b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5bb128092cdaf02f861f2085738341f02ae82da06efa1a6e2ddafc9e8bf54742"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "a889f959610347d1c062139aa0968e671e41f400d53c935464046816fe78f45a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "91c20e71a5263486464a69026673ac2950b49d4710666506004178b5b500a13e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "5c4133ce6e26e7299dd88f9b09b5a7b543d88faacdb4ddb9ae657511a3631c96"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "2de271dc40d6cb72bf6a1077e4f1716f93abeb3123ca64a7de70c2de7f71b990"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d2b5a15795f8ba559945c47fa0a63150b7f2780df740f780f295a4f863686ddd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "962d095dfff5e327fb0468a04b94e5d67d64aff5d60bf393508e3a14569e09d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "f8ce479ff68e9139666d9277a34ffc5daa8504ea216933f55560149ba505cc2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "76903bb98a7b1b2cc5d30528a8d1bf0130672310b43304a4b53e5a13026c7b02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "6e2d898a9b2301d13b216fa8e3c8c9598d6656ed3d95ae93174f63f587ea7611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "935a4f10aee9f2ce920c74b88ae203babb0d58c147d78e40e384f95e4d41b877"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f636f951f534275ed148d9867204e8b0ea8658520bd54e42b372d323df86929d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "957ac184f5648c248d6542ed17fc3ebebc45a269c22727833982d53e62e0dcc8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "fd79803c97ea8b1e8d15ef483061194e15d7af73694422ec6cfbb46192d50a75"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "041bc4f4a29f1c75b7248a9ed259808e7087f3b9416c688161eebcc4a932b660"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9be8bbf3d9bbd3ccaec44f85e97fc0b4a264dfd1202f80827789af5041af2e47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "040d68b469820b6fdfb20580095dfd216b4334b5e6802d361c04ae21ade8c5c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0178acbe0cfc9253233f94ec9bffc916b01a4b687feadf47a0cc4d1785fd8975"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d5d0ab59911efe21201b89e620b24653eaac080eca6c851730c4bf8bf2bd2afc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "90cb193551e4dce8e3bba0419d956d3c1128c596541f4547d1bbbc9c882b5e1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "fafc69df6dc85db7711d8fdfdb8440fd36c64ae8f2f04d41478e533185685996"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f611097c466ff4fa823733c55c522818d319e8831edc7843802fec1fb97f2aa9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d502365372e9f06d670cbd4ac37fe2619a2a00c183ecfed01037d48b42310d20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "f160d05c74c94507fb9e33d239da79f49a64250e946e4cc70b5b41a62eba7e40"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "153b97b94f55f64d736a05a367657222afa5448818d4a7fee89434a7894b4fe0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "673a4d35d0ece8297cf9a406d92b8597a592f2fa410a0500535680668f955b73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f984a28f3f36455387c86ca20b0794f8129229c7e8d4d93e998d15eb7810b41c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "dc1db63c0f6cde3c299fb0e6eb64ec9759e768e4ecc2c6c7463198b67f3e7b34"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "32789ec3033df646f1773869784ad14795347fbe65315ca0403c7e20afee0dbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "dcbbaa121d2b9c57a96d536716f22b5184faeebac8c5a283b79451a6be791678"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "63a7516d951c1e8a8b01f978dee4bd11901aca7bb14252738d1fb32dad7dba7c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "72831f8fbf72a3ca3d7f37d88def35a3f2cdaf5ef9144a60076715f0d3d36502"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "d5f0977c5098eff43941cd3d7ed3d56b3dfed1d6d90fa7c9c3cb4fd5b4845d7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "f5638ac1ce3d348da0833c8bd0bf0881ce429f74425329f65b79ffed485aff64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "31c9a46a2a4c01610fc8753dc89ef43d560565f78dc10a229cd0f6a21d85a676"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "74d7cb807886c0acd2231c8bdbd361b6f160d2a39eb200a6a539ec4f0eb9275b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "479e6a19647d377cee0c3127788752881cef8b73a434cfe2a0ef7cbdf1082b9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0b8ca02e970c927928e216261b2e482822ae322dea00d46293d986078ca11d2e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "f79af3d169540863b15308a8f0cd7e38116738ef9a72beb445b2111ed59e4740"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "0cfdf85ed5813ea8e4bf047aab925d1fd1e1258af44a0f8b76f076a136b4561a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4f8fcf017fed4f17559fe65de62b00d2ce570fb4f0b4b06df104361210ab8186"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "5ada333df1b541e0adb701a6aeb8efc020a7060e73ff9ad8b1c4b189b2d5df51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "e7d7d20ef84119c56b7e2b38bdf965bc4a56439eea1cfa6ce6fbd5cc1f5feb25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "881c28cf0f96d5cb26891cc647d0da24928c45b7e0c31f7e0e13eb4a6f8df2d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "61aa30fc074ce9e97762fb3822678eb0c12be54fc5198b9bc7f9795b78706dc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "3fff21375c791c3ed2989bbaa47331c7cffd39db6fb8925a496ea80e8e96bba8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d7f2aa694e7647235aa31de92c0b476b1c8392cbc3320db0b22672fe98d89073"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "727f4a67b0b1d3eeb925d8d225bd4c7d12c34a4835d28b6a7818c0fb618eec66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "94d32fdf0c71ecf1fe5d6139145a297534f8941da7092e4caf948ffeddec50c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "3f6a5d00d0511eda5ab7d1e55f2ceed8bb74eac6d4e5891bbddaf3ae3e53158a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "6a5e473cefed1e953cdcc6415ca9923af2fc222d0cf784e2ab698c7b69301865"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f9108a7a3fff242b51d487a712045a1b9fb0876496c8be8d5b0c0953be0164d1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4befd4ffcff7ca8de1d41310e923d11ee0935fb740b6342c2d38ca378206b7e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "d776c3afc0b2445513188ad250d380e7e2f1b0ed2601d70ea833bb9c30171944"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ea20c7bb55a5a9e12fb45451c2f8ff01c9fdc5588ffc46881833a841db647732"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d6c472e17c7548945d7012337ca7be3cafbdd0a8e7abeed31227961467be87d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "d89892244ff4d21fed02fe45c950c0943ae4355c01f377ada5b8cd4087c89a9b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "65f4e9f450919c17b47969b979ddcafdb1c6bcface0a224d9185aca859009f11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "8f86d096c1f1fcbcca57b5109e02b3947c7c2735ca3fdcef67bb2ce719dc7471"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3518a5a217a65d8c26f78c23bb502e516c995119f65b1995edbd42893a264546"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "f727b26afe130dc710fe3598f03e3fa69e8d1203bacbdde868c4d39ca8f1cdc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "a543213e05b62fc498551488a19cdf3ed7f79eda77ab9f2d6490688e6ff2f0fa"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "a14fd4b763c26d11db3da11b6b370d0a055e246a238cdc709a12861e574916ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "452a7b3e3ef421a4f5c422fed44f4576eec002f576dfdc7db13bb82ef5fe5fed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3090bec3527cf6c439bfd0ecf1beb56a596149e5bf9b4780265bb06f121a8177"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "4f30d2ed7143058a6db277bdc32278fda4a3099554dd78d3bc97455469b0b2eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "692fb91f8636209aafde51c06a5321bc6642d4caec09d994d9fc1e0656000d76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "7f4912625d1cb2c81602dbb3f191c94d4e6332bf3634d3e65bdf70bb9b713170"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "155ca5b4868a2740817280eea0c92f05b3f3fd43d1dda4f12efdbc8f833cf68a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "ef550cf2ef0eb3b4548151178f71401dab7613bb80164a549ff79362f700399e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c13354b8b9efdee5f013f063be7f7141b5466de0d1db20714860a9c1ce7d9721"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "d4a6410c3a1163a9875c60ec98175b22ca09e736c8396e139639c2216581e6b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "ac8d69dcc3dcee4cd37181e79705e9091e5af788211ac1804e64aeb3fdb7cc78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "667c57060c29505dd7a37c64e5404990c21dbae2a0216080cd10e0a2504bc936"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "7d56e04d886785e08c645ca25564d45d5f52de3b400c4b3f16474430aecf5733"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "166dee35d63a96eac46ad53d577a2393d74f5b7263794eeea136b989b015d2dc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "21fcaed9427834384447e82f5440113860a93a97c1da008da17d93de3a2dcf4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "7a47ee62de52b6f058131490e04636cba75bfb6c0f7b053ec305f9f592e9652f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214272, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d33f549ebc32c5fccbef0f711d770410e520fd7f845457cf88673678e80dfe64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "59a392ca26bd31a3d0e312fa65c24eb6ee205bf186b01147436bf53a6fc92ddd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "53a915b1f66d828a0ed94d8be23c842872f16372c6e9c3aa44a2f63e8559dba8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "53dddd8d976682e0d3c989933ffba5c2cf6785ba70450fd05693930bda8ec489"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "5e57ff573b7dcd2ece5bc1c80177afc6a4cedc5a69075f669641bd2d1f0d9a3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "348ec593c02a317b722b93d9de6348a0733e96a0f32ebc2c8ee37cf0eb7b7c3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "7cc71ced9e8ed7bfd962b0931a0baac5bc059019ae149e32d1c79a9da8be2f9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183568, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "2cb4fbb57c0d4662b7e537f88972967555a81cad5c136fcc5b56b04da2bbf6ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "4948b4da2b43d00868db3beccf02f3af6bebff7f3b23fd99bd964bc243a57149"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "9e277a27c8134659216fe71457c2313e381c74801d8d7460ae0a4ab2dabf214d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "70874ae517cfe5082fcae80377ba2a38825f74c32499c4856621235aaccb0d62"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "0575082a38eee72842674280705f9361112422bd8ce42455c88670f09c95099d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181504, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ffa09ce097947ff5232f225d1faf618da12264f4ae948c8f16896c17694fabb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c71f33b782120b552a3f47fdf1c84af52ad1962194b389bf7795fe281eaa238f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "466ae6a99e1b63a45eb410f60675cb348536196dbd19cc2f57c477a3ce2f5db8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e20f2c5f72c68436533ab1f0888906dcaf0afcd182d547c1d20302af37227f89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "7325eaf4009442f13ea08e488f81c69d5c349edf7a090c2e18016d3bc9446046"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "d6f570bfdcecde5e6f41de3710b6308eac6561712d5f6ffe28e5592479817ffd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "432d1e522288cda2f581a18c3ecbdbfe71c9c624d794d068a8cc9a579b8978de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f4db18c9dd6d50f048fd39a6f0e2d8d555c688ce600605bef5859f8b0cd9b4e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "3ed16368b09775eb98bfadb821a1421aa36b9218e942c4611c830519e3e9d5e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "876a8e44bc81393623b8c584fe6c25e39388b62b551480343df51e568d4775de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "5cbfbdf283f64d15c27ddae562bde07bf8ee2749ffb7b8c578f5d971c6a21b0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b1447b54bbe95130244b1a6c5ff2936b95950d30e4ea0f0199cae876fd813ae5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "48d9da6a4761afc8fe7bb2499f8fb9de9498e2de40f2f7899fe5eb842862680b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a51acf47e2bd2f0ce3b3378abf5c29ed5a4fb5aa27937a8f864642fd2d291321"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "93e33c074c846b367a757f969bddbd89a80719d1177afb594c65583b05adc6cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ddebe60e0f716bb6cdd7c1b5b880a64e15b5d13d3cd67cfa076990e256674742"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "54c6573b1f037fe4bd51c1282a4b0a4e8311ee3bbf05694778fd14344860e762"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "98e442675191a2eb800117e3adb824a97f20bce2f534c740d9244665ae91624b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "e87f6a5990223a7b7369072b37c2bf8ff0f4485041befccd9c5805cf4b9b9002"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5432d637d79f98df7546953584898d35c4e299bf4ea93b9280162e0553def2f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "37a24826e1e1dc5754efa0030181fe5f80c226ab57d12feb91d90dae887a6f17"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "6d845910bf3d22dcb77a3ad3de7bd20061b526e01e2bf3c3d3fb48e521e98449"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "c2617c705c9188e597292e852b8d8eb3e7f0dc71e6bfd4d4478b71595a340384"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "3f9d2a871346151cff5973aceee37cf71a640f30655de5a53c806c8034ef29d2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "796bd1afb9d61d2512763923657d37c3b84f8568b077d024842d0c652c5ce11f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "e7fa53eccf70d0cdb232005f59d49ee2ea81a3090080902b431af0c0b275af8d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "982093038e061abe81b65096150c8b69c2bc64d83b886c369be30437ea0ec6be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "f063c3a13ae934d70e38675a3eb71a56eacba297b3a4f5bb31dc3e51b0042eb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "1d2b19a19287e0bd7211b141bcf12998e82fa005e0c000074f99b210d4ec8c5b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "69e829d1776f638650673e84a0f359552120734fb5210b161ac5153a83165de5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ea4747c653f002c13f49e5141de35df692987232b569671a9bf4b72c3851f4a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "39dd3b4d72b968d1a8afe983a465263375956263656706ab5519757dbbf42879"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "257ed4575ea76ec0f5c19a1bb1aeef121a21c2fa9e69cf72fb251fb391c801bb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "f0cb47e14d939de05a6bbd89371c08aeb3de89355da41b90d418d99bd967a1c6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "1b0144bc45fc601268e7e45bd64a4e62b862a9fc9611df7d0e7c339b9b2c1ac6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "86a9a342228cc4db0186d0653b236b965f62834975f382a637c1faa184544288"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214272, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "66c4f7ea78f10bdb5a14bf56ddebcbff38278cc92110a71f4c0f6ebcd3f93f8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "7c4a7c472d831618d3dac3bcbd989f6cba5a9587d7eb5bbb7bdc50c6dff9ebf0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "6dac5099946d86f3154146268a07011ca234e46595a9b1aec55f5e79b4707590"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "5e92abfff29a8e57630e31430ae0e5355de168b440130c8fc7bc88b560634017"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ca27804fe1d9fb5f80dbdda141c1bc24c2c6e76542db43e7ba331629769ce76a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b213cc5648994a7c586377e01e03ba8abce1df7a6fa31c5b7eda702ecaec492e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9e88010c11e4c9ee0d1a5dbe0c18a9a270293e2fe879afb4ea6e3d5d996cc1e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "c260bec436c60caafd54cbb7cab2d2ec210d483f1192bc13d447c776422f3207"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "23b5a18f2b1231d343f51dad9d6d2fe6fea214a2dedd692eacfac16ee7063f5c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183568, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "9badab44fc7f576bfea578616c29dcb63a4535d8b72b40096f371e973d3e6657"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "df1c0f9f33bcc5d7d77102e288b535f4b86380892eefe01970254f5d02ce29f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d38d07ee813fafa71f74301f68f1339845b216609f123a4f7adcf0ede78bf4b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "2330364d4542665629de4133a9e2af8b2a1371e897c12abad4fa1663139b7062"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "53cb5402049ec50b4bd06045a431ebfcbef92872565be2961fd40ae5e8a5042d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181504, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "eb65f70074b79394b0e05fc380abaf54a2de310c5d9e4f790fa72200cb78fed8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "b2dca07f2e145b8b78b8a1a31e7665be1b2f6f2a86d4259b8ea6ae680fbf3728"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "dfce7a14d6ed6ed479b791b002261c22c6978efa33d016c946438a1522da71b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "47ff9dd7bfc694ae9e1515ae62937212c564ca33cca6815796183963283abbc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "8d9aa8b30c7387f28b514053ade42fb80eb4a7007b57eb7b5decc85101d0ebe9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "c215a6936c6e6291f2973d30d334c169ffebf22e78c68e1b255414937bd1f8b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d345278f0eb17e6f824a9d5263c18a2cbfd4da5c7c5cf72ea9d96ffb38ac3125"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "3729f54a6455e74201a098291e7b430555766cb55c988b5253a35d4938b032c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "ed362118cc97da36593002cd00b9c51eda04d71de1ce0205e3ae091101a61043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "3bb1ed5a3e2bd82d45a57f84d53c5d8a239e2cbfcdcf9331e951c87a6dddcc99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b57b2954a1715f2984e6f1c7f73c8e3a0061583a3a42262e8155b4b13f9bf73c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "0d4d5cbd4840e8d3217cd3c513846a8d7e9c42a7eafbef6799f1888025b10ef3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b59a5eb6b6c78f9567c1f77ddc807b0ba0d673be585d1ba31fd28e56018e0f7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "4c279bb10d50abb62a86a45a409d33973e73810722001d3d75b2a8bb47300fae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "af9fe8d17ba6c16f2cb8857c265203fbcdd807bfc8781b14745608c8fcdf39c4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "dbdc7d2e5c09281776d7bedb32c4f68eeeb220d59f6676736b0120addc6db1c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "d0f56f024119e6e78a783956820dd8144936cbfe0d645626c41d74d1083e663f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "90d0e33c1f0e8bf44628d7a35a602ab3fa51d46e6b31a1ce3ce3e5ae12356fdf"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "e09655404c8ac95cb2b92fc5bb20c3e1cf6eab250dabae63e04121602a94eec5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "4381bef31b70d1b2bf80c94defc6cf9879b81d9fab9af25fbce7f5f6f1607256"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "0de6e5e65de13479706d8869f0c49efa23259e5d87acb2a07c44954777d59a96"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2ef0e119d181cf6d944198ec2a44a9c42e78a9de591b78e576f24b81caa9765e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c15952bf59512b106fc1c007c783386ac235e08a5371ad8f40204ef0c8a3fb4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "08aec7867cb88d82a633f30073492f99543c1f7151e5c1d25fd9e213f879e3d1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "ebcb59068d963fa8e620eadec483553f22adb26ffae5ee785fc4f2c49a71bcc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "958dcd67c7144b71966db180c5b5b633dd6a73a29b8306a8ee9b020b2d07b92c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "01aa8425c7a0fa9d42eeca4bddca419a18f7935c251862c5e650bf2a24cf2303"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f3748637877b5216b6ee19d8a27a4dc39a7a390aa9fed17c91a68605580768be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "7ad2250a2f268ca05ba0fad627c4ba911c006e9c44f9de5150036fcbe71ab70e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d45a45130b49e7960e3841fcb6d3674642fec8a2fb00f97f3a7ff631a756024d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d37f86e648d3c567b8074c00eec5e55b51d5ea371edcb62b18e0c0995158e475"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "44a5d2cc28fd96c3c40323905a6471d05062064087d36940e054a71dc4ac1641"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f7d2dc8a222b17a4d90cbbf3d3c9041d0fb121c2fe6c7fb557a80e75cd4e54c0"}, @@ -4611,322 +5555,422 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "abf33cb39149fbd37a2ac19734632373f011da3f820ba6e929bee34467c9676e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "e8750f50fe73ab8c50a1280be705d8e31562afb38fcfb2b2de3a1fb5fa7c444d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "11e50a958a78e3c2b12d33188554da169d766da35009d6c3aa41bd981eea2e34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "69abebbbef419302f42f679f78fd9d767063b7744df57b5cb45c54e5fee50b06"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d147297d699461b2d5c73fe631416594c2d5767d40ec404662ee998e18c453e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b66f1cbbff16fe3260f10dccc874ee8b5a49c942b6feeb939465f43165256d31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "9c58f27e2a9ea9cdbf31c816576c450826db79402a72a09075d6022df025568a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4a715e6e48a8c0d8c7b3f8e1a504e0c4e72e28ec66439bfb0e5c07479e54f94a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "3a4d29333f7d9a9827f0a045185c5fa929fedc531db20582db032b9c3238e012"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d3e5fbeaa7cb84d839566aeec7089ef019cd917cce47008ccf6dddb02b9700a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6a521c090593bfce3baf041188429610c77fdd06d27bc44578f559c6398fb381"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6ee519536873fddbbbe32f3b2a6b4806feaba2a7e90a4a699346a9c82d08af70"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167328, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "7f582ad16d0501daebd5ce808a0c61718fae81d961b7dcf512bfe55ff50cf9f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1d0e1eec3c5d8134e5aa3cc9fcb70fd66742774152502104194e38ab8a96557e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2fed6b51095e871ac44ea9d4569d59124f329b66d2e3cd733734cd9562535508"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157088, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "02e79dae12b87f4b45aaf4e44a94e0b3a79eafcbddbe1328350fa20830886b62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "3bdce2d53ab181d6b9352bb814460dce22a5e67be4732bb0ea2d8b790340f037"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "908ae1642e794886a017c07a84011555df932dbdb2454b3a8fbbc27c2fcb7c36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "62e01c4b33658c714153e2a2da3ebb3fc4accdc455f1337ecc4b47a6f7b31df4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a63f4d1a972b7601b2ab81920ea3d65fffd258fb759ff56795f702cc22df83d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "50718f8dcaaa5563f39ee03483b8f6150ce5d2c50b9289ff7a39fde3ef88d55c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "7840968d73046ba1f76787353546e2ccf6baa8b5ee386f06dc2e483fb2d47520"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "23b88752752e496ddb4f33d2846657fd3dee3f1508102be20bdf2b84327b5b16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "6556cfd5bf02d618abe245acaa84fb8f9c474ab2ee08e532f03a7333776912c0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "14033c236b214d142dd2c5ed17bc744debfd60407ea8b8221537024c9ae32761"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "4a342ccc04ef6980ac241343744e30b7db448375a671b1f72022728d3c1b731c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "0e73f3e6306d2c17f6092d544b2d1ec3d407258be722f1d4b9ac00d45b9694d9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "12f612c518679afca1d6c51f0c50649de98494f53821c7ecd902a999ecb15212"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "ec21fe53fa30d413868dba5191a657ec5a2165a84584e167169bab94bcce903d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "3878e323dd96e6b7e8c5e36e069b30325e9539a1e87bdb31ef7906e50be0ba86"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "8b21f9c3ec98b4f1c58c79c5bf4736e331120d312d44085401d0be8b9177dcb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "4cc2c809c6a8f19486a1eb91e72be8a6ef68159f17874af50bddcdcf935bb67b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "553acdd6c9fdfe0df85d5c65066b8cd0239fa53517463a00cedbe609fbf7ed05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8916eeabb08cfa340c7367d6e14477770736da49c261603c36e7a2be187def2f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "7208922308adc48f3b98c0f7a59bfc99b31de28c0e044c47d365668d4c96841d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bce4ed7ea5e97cfa56a098f336a7cab34e01c6f4f5d03eb406b0a4d388a7dd03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f2dd5d0b8011086a7a3b854d47abacf21963bd2d31e08df52865dce4125f79b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3414cb366afbdaf9ace34df1b0315ef89ee1ab313ae78088dbe1bd28f6569734"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "7511e69c27b07dfd0a9bda7c1d185ad3eab2e55ec8239aca2dda60112e508eac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "fd2e331913fd60e2413af2558a464d8d758bc0a9dd4c2d2ea7d06d5676c492a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "c4fd84de554a59e428ad28356cf8128f833b6ba8fb33f3d5e1aa5c086c416176"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "ea4ff53042bbe6299b66dcaa22b3635e3f3dfc4146c1a9d7271c2624aacf585f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "ad78412cfcfad128a315dcb4144fb4056ae409e807aece858969876b0f870b65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "a754966fd77391d5635bbf218c47956a2ffdb67d6476c8948bc6931859d8d36c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "8e5069dc4e8c41ccc46ddb67aeadc0c8922bb7380ec1556169b1717abe2647a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "20818ee2b305a94f3ab8fb3ea19d22c7ccb41b6db36ff45c394f96da3512a2ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3df4244778b00658310fd9bb41504a835e7861c3b9a881da9ed87f76b8b8903f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f0c95b052bb25aaac8ebbc3c8d18e1c1a57c60862d621039949502de162c230b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ca07060dd35c099bba6bc7d20f2d62007a77979f50adcf1191ddab7c9c24dbc1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "7ef9156487c5c199728cb8a59eb9ba01a4bc98425a715691e7c35875b99167b4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "599395b67070cd840cfbd4d99bce9b4291a39863c163fc36fd35382352eb46a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "40791662d199147c2258719f0d52ecab2deaf65f56bb28926752fc309e36f2d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "79a351a05547e7e2842d1d7abf20bd565e35fca80ba58fcb691cc3dd3ef5ab7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f9f35c8352314f5a0a9e87df4def9b10ac57f41358e31bd6455e03e7a248116d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3591637080ee4b5f5d00ac724d37759403017974dadfd6f96e4515f6549783af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "6817ac9b8d851b76392d08c96beae9d6628da11010c2822e513dda300b10e101"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ca7b3720e17559a405ce3d0f82624c7bf2807f17d88e2a46e41fd75edc5e1152"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "ff9312a3da7ef1a65f43e4e15b0858e2ecbc78583def64178e85b9d67baad98f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "61e27975741c051dccbdb7af50c4cfcd5cc89b6553d54da2f9633a571eb09450"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "9ee8db69d59239362be6ca3d196978e071dfd6f2fefe7ed61136d3c13ddbd579"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e3098c7eb4e9f8a1b581a7c7668ef23d569c0c450a97f5d43b1e4684615a9a78"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "8e8871743e2272c5a9d5e84bfa35a2996753fe1fc10960d194cca556bf267223"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "05a0dc4be23af799d287fe2f33cc47b7063709a71f9a979beccfaf36fcedd611"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "04428a6ae3c0f2949b5c64ccc8e76f18afd00b7710ff2f743507d83136c022b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "12fcf6388434ccc9cab5476ef7c30f3cd3aec452f7835046ec5bc38f3765c36c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "94328d4378f1f8bee8b9c26bccf05da343e8a1b2a94eb457688cbb5d5ad2a718"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "cec96c63c3ea574c92dbc8b6ab7078ef7ffb5978e5315eef17b3492c2914e700"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "ab73a50d8aa0fa3aefb107a71363cd29d45d1f0a09ed29e803dfdfedecb98e28"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "7f83adf900c9e8f9cb7f261b926b6136de31776b3f501840ce3b1b57d744b273"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "e479390f088d810fcfb2e3678dd51feef345829e76ffb03f303a75cd911218d1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "1149f2b2e07463116b7f2d767fea7de1577cbc488e972c1262dde7b8414c3653"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "0db2ac46057fc824ee71a69bf68f207eb295cf0d0f6ddaf41d5581f3d7dcabf1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "b104500313217ac1fb1e9c80ea2293d346f339d6da098bb75e3d2d5373b4f619"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0a5a552e8dc2af22546c7c1152931d23b60bc2ac52768b8c5ec80cff5afb5e18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "7856d2a3801e0b0f1a9e63f4a76893ee94401c1f2923740d017225aadd2086a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "890ce5f1cbdc5a21fc924617a4bf2155253bc710e47d6d7b03d7951e0720e44c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "22c1bd3cb3c62b4e828eaf17dae30f0892fee70d5f40d17cd518a510842ad1e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0322dda254b7f68c3ac4d9a7329820e72e6aa7d1dfff4b96d851d665afd1d2fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "9b2e8dccf764d23538a8393f4cdb8ef54143d6067ef1f6dc4702ee2168a380df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "18e918cd05b77f968eb4cf9161c13136d0ea0696473c05f8cbf2dc034baf18dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "30ab7de7031b1507205ea22ef84a9b80439101fa57a6cae0875753c91c7b5eef"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "55d5c5defadfea6596e8eb8a982633f6fab4ac1f933b3d5146e545bb9f0e1d79"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "57105eaf5bf47f706c6d9208c6d462e214e052299245f4cca50c431cdc53a431"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f41ff4a3f4fb338b8db09e0c1cd2801db215852b53b3b8a1a9255c8a7544a6b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "cddab1e741114e1c0148e62a2cd2be7cca62e05ff4cfe3930bb5df216997d083"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "943994aadecf046b6e503411c7be7648e6e268ce0fea8666fae1f3dce6114d72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167328, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "82d0b782eec70990a4c7eef39f73a9e26cd6a9b4025032dec3931b4069d507dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e9df3c5a3d195f90214f04379650805a000c95bea196f9f370f9cfc5c699d9b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "ca1448b330af08c7b87a0e4a2fdad246fc5cad17905f88818e342f087b979f80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157088, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "f0f9ea3f953b9e5cebdd4f0de5b6c21affa054547aa63907061de3b187487741"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "0ec57b17dc1f2934eb27174e4e0852900744d44258ab8feaadaf52db36698583"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "7b1d60906156e504e3371908fe4b5d157b89f212eb4763410e22cef131709927"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "a5c7bbcdf9277929561e53e552eb4227b073ae359dbb661f0d9af4a48e8bfd9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "b6f0e408e5b317c6561463185b7c968a409392ef4e901d09c060ceec0c707c68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4c2fa2a084435831d715abc227139472493db3c7c7b848e02a7dc7f40657497f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "cd107849e908eeee1770f4defb77a4893c7330effd4b3f12ca2fff668bb73390"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "22def9051948432fe8f5fe54b83bad4cdaedfad4ce196736d52ec58cb05dc943"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "352798c045418f6ee61afcadc28e79c01f62dd6f11efba0f0a6430e443faebe6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "b75c5f2569607521af6900e6247c1746fa419189290640ce7f9abb5c14873dfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "c7b1bf08c1b999fa5351dce39e28a01c367320cbd17f38f0a913be9ac761ea92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "06e576c24217ecd3b13c1124fc928d93ff783055372ac68e84399dad42e91e88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "472c40c771e48672751fe94e33e40a2815e2ca5465788b62b573539ffb18890d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "68983ee3cc6072df10e2d917e4da8abed501dd1a30f2e5005f3b9e68a2fc33df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "83ad2a867790eedd7a8e682fbc2f6cff9eeb0313e511ad5dd7a4d5f5548595a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "9aa822e7d6cf443fbfe20bd2309681371d65f8b825fd69e95589fa7eefb1cc6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c12db3634829f5cb58f2aba05d5df34d05f1fb8eb44a7597d0b01dcd0fce8e63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ad70841e38b9bd83e26607ac20f9c39d16eab1d7ce8d1681ba55fddcdf173c23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "62acefb3e44397f0bcd8c4e35a2f235c76223a844a57f455b16b7f0d021373ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "58dd3e9ad58c8d14df6350ac7b88d9347269d3140d0f7882a361e876bb4e2593"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "85db7440ab53c8688840678176d1393e8827563bd92490fb4b2594d116019052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8a78a7b6a610c8cd46489ed7de49b577651c6c44070d964b51bf8b853c89c977"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ab17d41ce84e294cd6554d7b3fafeae1d9b9930ca2999a121038d8e7983335f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "38c8382dfd769886775a30e42b40bd9a91a92ca0ac6d0713293518491927607c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "005ec6b5811584d41f85409d8c1ce1a9202d8465e08320d4edb87771ae75d6fe"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "83a0d221129f4daee8ae964ba00be0564197a0052f01d2d84ca004d4cf498c15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "3b37b73fa55e751052e67ee60de4be18c34af403babdc94b7cb1d2ac71598cbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a45454b2c97aae2baa8d5d28f4c6d28ceabd5259335d79190e01d5a160772d88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "f8a5b1358ba2fdc769dfc29d613d6c91f376e4c57bfd2056136647ddf9f5cfe9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "ef956a8097d93f7b3b2066b2e3e8883e83dc72d27fdac93c6a55cb0239f7a28c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8ade6c1c0537542b4c75991d8ebc14b438309c677fc1dcbde4d0d5c13fcc2a83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3b11b25aa2c077f7207f50adadd28b9856aba8ccaeb9b1539302c661d5567fe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "2c17fb566f3a61aa490131518df840a41ca7282ee0207044db6f698bd5ae701c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "9f47a3668a7311bfaf0aebdeeb4a23bbf56e044598a74095a21c6f0ecfe6cfa9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "4afd9193e3caffe04e0b0a593b9da995554e32e4faa9f6abfd10310f9aa48493"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "15fbcc97c69725487bdfdcb06dead249cc2bec8b521ba82b56bcc3eb1cd6dbaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "80bab9a20c58d64902f6d40449a30e1e91855a7302b20b127376f6545ac8d7ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4eedc3e16976da2749f265f906a9803fb1344b82a52d6b6e77d4efc29d8fa82e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7052b59b368b637cef024d097f2827e232683af2e99fc550b5d77160e610d5d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "a22b71a9959eeaaad6cc41c1f9272c990b4d94dbc5cb493e35bd7027997f0e4f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "ad62a060105825a4f5f112750a75cde2ce9d9b1bbed63d277bbdee1d25f40a6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "221beee0aa05e45d94055d2ef697a742069c8e7581e4f08459037414b56d3ed8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "fd3a0523394dc7c50a16cb0d02775f0a17a66dedc1f22fdfe13fa608aa091237"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "9105389638e454a2f311feaf3cfd922fc0dcf5f471b7aa058423cffcf56c3a5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "546855995fbe1325d5867d516379b279410f2dd4243fd3d1afba3dbb3871e9b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8aa011cb57439e342e800434247603097fbd276e2c54e527a0e01802f4663f00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "52089e5ae60e1c1bfbf7aa94c89f4cdd676e7d7408589526b82ce6e6eb331e63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "468d694b9eefd57fecb2a27fbbd59bf7c7db6d860a18f5c03d243c3499d9e189"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5a6c1c7c58740ab613296b0e283acfdddc8cda5f625710fb364dfc02cdcc2511"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "36b229612ef9c279b18dfcc5a58f10491183f4474f5b4b8f76dbcc8ed0c69eb9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "c32011975cc79ff713ecf6e0d95717c2acc1bce9a5cdaab5b07ef48a52d7d94e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "519f74f0cd935df1e564323cc4d9cf9465ed9a92ca6597fecdb38583a271d9d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 209176, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "df5c24360dc9335463941a95ff4ae2179927539808c047e248a3a8c0a954a410"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 209320, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "64d65a81f25bf97778e56903cb47038b230b1ae20d97127c53937716df55ab3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 196248, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "616a4953a71a0bd4ae588eeba9ffdfa890e439feda09f91583e0e16ae3e4b30b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 196392, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "6c18faccfc4145d3a43251de7079fafdef8cf192ed44337d34d64a68d804ebcf"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "a0c11348938008f9efb801c63ffa4e56528c8f405361578d4c06c9f0f53e44cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "2c885b688ba81278386f4c31246a99809c899b7d8a46a10412a00c7ed00c761f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "38ff154a708c021e066e01bb629428c9048f54866452cf4d5d4ac7a1a9f9bffa"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "af9fd855f442382af2dc0a80e94eef27670e98aacd9cbe5ebb3f56aa1a6fbec7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5c6c139a660c328859bbfd06f5998ff600ab0219800d9f770eeae688cf07c8c6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "be5d4be024d08edc1f8c164352eaed3f224c2c69dbbfebdf8da7a38bf7116700"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "b69d91882a1209bc4998e72c03a7ba19a07b870c10dc389c6d9562b5e11d076f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "109536c943bd79c36e0538bb9d4154760692a801a955c8ae5f62cced43a73769"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "5e12017c4faa1c123907ce636808fb5e59fd3449cb3ca241d50f1b52fe2e51fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "cd36612277f2dfe34be7397ac7fd34f59ba3cefbcefaf7f444d6079baf1a8589"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "08a75894bd6f9008715ef084115577efb163b3a7ee5b6fd5db1119f39fd7d750"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "622ee0938f07eea02f4915067650994f46fbcef92ca978e1384f425c9ea8c393"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "6eee74d3d1d1bf763d504be43cac71217dca0007da8b8e595017f0b6db962839"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "27c48bcdd6468c1e8e86cf15c464e7b943895dff4e6eb3c957a85831e962e286"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "615b9190237eb4e6d9e1db70b00266baa703489a4f4efc1c96a2e88268d6d73f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7a0eeedf1e5da2a1c38d194fc060a5215a4e116fd64e9cbed28916ada1bcc4fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "1af963e55f1e2a04ac75582ae997092163d2b8189d2b0f80cbc3c0a8656d8de8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "954baebaef930e2f4655bdff7b08bd20ac324a3958ee329e89fb4f7c242efa7a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "cddb024c2b5956772e89e06a71fc0328be2ed83b8c8d09b558065062821e9a90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "31571e8cf2864fb543678c34f729dc297d7a9cbacd990df5ee50e1bec1fc8702"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "3999413093f7c277aec3f31d7d20d41e452f6052855080ca6edb32ceb51bc5e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7da128a59984e42e1f2a7ca4a825ce4c1f94aa5ca0bcb234b4eded3670f56958"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c5e21be28adc3df091d2754dcf05f7b64414ec7fcb54b440318f93528e1fd580"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a00c32a9bdc41f6c3505f7a9c5fbc3e0342e426d7e6f8a7ed80697a7e206a2f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "683979454874df9f0c591ab0a93e700b47e55b2fa37196aa4e8d2deb895b663b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "34b863a8f98429887cbb4c5e373837ae6902888e4f14285dae7d2feb578effc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "f1861383bbcae4fb8f19492aa28406245cb537c6157a4b80bac18664807db7ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "afb33ea260c2c28a75c3afce614c11ecdb201f132f81dc349c3238873ee55b21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4f5f5126ea153b7ae44bd466a087e1395d54efeb526a0e739dcf5b268ea55f23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "393ef25814beb66bee8421b6e15399f75df982db6cc0d79ad22068ab2a9ad516"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "62429d7f4f9450dcafc6ac6f080f66c39644e5c7f149ca8c06f8a81c9500d37a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "7a8898444b775c269cae9a5647821e5dab5dff7f73ddd646b0970fac7619608b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "9e3a09dd84029a4c9e6b602809a12535be160126f68941d2041b4c53b3eadb45"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7c73ad45dbff35f34a9dabcd31d2e2564de8daea378aa1243a38ddb2c1557461"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4c2a5a92f0fd22a2ba661b6d2c0f538b933c44aae0fc45e156d734e09750257b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f76274b4b7701ce204fc1be8833a98cae927bcf7c0b6d90899a5444e5f827227"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "683b45ccc9fb43103063a9826db5ce87be5c4d0504a08b86678d0ff07d316969"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "48cc194c20414f0ac996e7e8057f0dcb86ccd60e07ce8e9e645e19019e84598c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "18a607b120dcba446af6d6d895a88de0e496ed997451b8c0fb9d5a05e5b35517"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "ac74d35cc1eef7ee7c4e30ededad8d612fca9b26afe00b954adac6cf725d4cbe"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "1f193ef0b94a4359b075074b7dc9f61866b7b5691923127cd84bf9f649f4f173"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "4ffc7229304e0d2fbb21130a30bc024908e3570092f0eaf5b4cf7c71f33d7e6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "9f32c2609521d92e02e6566f14b1a4a8ec86b957d770ef65ed85be6060c07286"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "5224943bb63c0a2bbf4c7c7300b5d8159ff36d40d1c7ae12c59f03b2a570f73a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "5a168137b34b456bb25d807312ee3a5843a653336ec6ec1e7d62463e6fb70dff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "1ce50c3eb152ee564c85110d0b690c271b2d32be0145c291c8b7dca3855c8061"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "d7e7108a8df4f35c2fdfa82ddb65ba2ac2cb4bff6cc71000b76a6de949101c0c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "183f82a8771f66ef9ae8c51708d7a84ad571b7157833e4aaeaf4cc5e103dd087"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "c421c421ecda45749aba17a21198580b0a3c80cf806ff11cd8f9b434cf8cffd0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "5e4ed42021dc1a1151b02e0c480c96fb3f81e17023bc4e5814cc8937fa85eada"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "369e66114b48e15736ca5568aef14620afb2876ca7d7da55e186103a416bdbef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 208664, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "4d4cc7d95a48e439e394f557fb72d2859ec6cb402caa62f2651b93bb7e8dd452"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 208808, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "39a32babce9e5aab53e966f0a889f841a56b97c1cb58efe9d6deaef90e09e06c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 195736, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1671dbd5559be50d5f02a020a8ffcf2fbf702b21292db512e37672f287c36f6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195880, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d93e5da5033efb764dde10e7a414e8bbb82421ce3de8f336f2c79c1955d09943"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, false, "a30f28c37f86fa8ffc63a9693b8cd3a0fcf7ea3cc7152ffd21b01130ec4389a1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "88706b72df6ef983b401d34981186dc977ebe18994f033852398a87c1c2b92ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 1, 3, 0, 2, true, false, false, true, false, true, "9df7c7b2392dcd1d0d28dc4861296a122450299e100e39fbdd6290dd69d75096"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "70ad39d1791ee999740d2fa28992212d11408eb431fb7f0cafc04d60798c9a42"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "2ac4175a51d64be2a89c3cc68a35520a3a412ceec23d4a83ba7b6e175df31227"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f880ca51afd6164e5fe3295ebbeeebaf15f1b07407f01ed6c7fe4fe1ac96cdab"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9db636a350bc6e705d7619f162830a41596fbb4d24769b7202bd73933ac1ae88"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "41bfda0afdf8955341874362e86d22c379f5b010d0cf468927767298adc9b9c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "ce6386bccbad05d93701427ae113cc5be20ebcd57854a5599471d0c68b923cf8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "008132e502b7e0cab2522ffd73be120f510b397a10a0fab38cf936c5ecbb9df7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "96f18b116497588a0433dd48fc93349cf7e15860c7d676c930b6086e2dd1fe38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bc6625272139108f7a2f8a8c30b7fd0711ce47174c47b73bee363cbe2ea505d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a9e5f8328eb3db491563bc91c445128a3dd7db364b094cf796a6599db156c2ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5c7706f0f720d6530d1b696d412daa6f6efc37f3fd9b8a3a0a91afbf361469b1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "46ee55326ad0d3e5bce047453c8bdac0891a548aa475dd82a451b309282983b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "8776c90de29e1afaf205971e2207df9b09e8b8e6b899dd5d761d9547edb5f124"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "115ab03f7ca4e367ea5740137a7a718fea1aae5f67f5c45df6ad6b6620842fcd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "8bdd479a33bf0c79613d434ab2bed8ee95b939cac061982b9de19620394d1407"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "49446a9a98ebc5421b5d35683211dfa88b8e9fd062f6f5f801a3f4e6b857530f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "6737c8e084c8b86c393c5eb4dfa74cb386efc1ba0c27fd0b07a041b9443bfc4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207608, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, false, "f995d0c7b9de722e693a0e09eac55406f4ca426b3dff8563666abc54d373df5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "6e8ae6bbe55e8a9c247f567a71047f6d561d50c9fa0a7bcb7866ee242c7b06e8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, false, "3bcb5a155071f8d5ec73367c3f2a1870f9d3b1c470eaa667e9c515fd1b868a2b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "fd67bb3dc4e70657f205f8fe3fc16babd15be1f1513be5d3ee975f310e1185f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "61a63b095628bf495f7ee69c9a1f86751541cb69b44255ade95370a345ed27c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "760135f74fe808df8284a14e4aea105aaa0df6846513a4533b71b218ab603e38"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "44b303f8101be4d5e76fd932124210d5c592a4a00763d31c7a942e9142b65686"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7db9e50f25a08d7e159c6b5e73824fdd928d7ca96a5cfbbabe2846498353b6ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5b2900cd4598136a93dcc13dfc96dadb93511878c182a5aee5dea3a6148d4bac"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "9654927184de874a54076ed6eaebb058d41567aafd1332631019303d2a77a4fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "0818ceefcc7f8e9a4500f239faefc2225cf5c1c8de4ec5e741ded9d880373347"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "bf00541f490b9fdf4c8d68bd4da8b10d9783263df8308aa1a6722b963674eca1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "6803e7a100a461f296887c112084c5c63a98d342a08dd0c21c4c1e9811b03ad0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "0e70d50ee9579fa0cdb3b1d6500ba876e271b2b127151669ccc6706c98a34537"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Persistent2CtaKeepsAbForGen", 207624, 384, 2, 32, 1, 3, 1, 0, true, false, false, true, false, true, "895e6d3ed5e0ad529eacbddc870fb4f1ef055e9d340a9d34b43e952cbbf5257f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "6fb98d818e8908cd742d720020c8dabf89af34f7e710fb42c89756ef2fd3d9ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 1, 3, 0, 0, true, false, false, true, false, true, "122b7ffaac7c32695a9ad5bd41d4f12bd56caa0ba862510690995895b24878d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "bbfc05b685f992327dff6f90765d2c055e7f86d43fa46312db777b7830f7734a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7b9de3f0c8b3057f8b128b06f4bb4b24cb36d9ce9d1f317d34ff8cad10a989ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b25b9e48f8f1d925ba647d0c2a8c75ccdae537ab38d67acaded42bfd7a61fd67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d56dc244951560d8a273b8a47794a579dea58337a0dcdb8dd3be138a48f81e01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1f689f704c61692fba34c1ef5323272bdbf38909924167fadb61de32d985e352"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "c888ffe93c1587240f0d419b1fa5dc46ea32c05a5877af983a3969d29f1c4444"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "bf0b9091df5834638799f308ce71e82c570b6b217ff3fdea771b7acb879be108"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 1, 3, 0, 2, true, false, false, true, true, false, "1920ce7a0b986c20f6a2960bdd3ccda7a55480c70fe790d712b278295f8ba8b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "4faa7ab26891475a3307d5760fb8baebc977d852d20696b5d8793e026c48cb89"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "2f2f21f5500fcfc80d14692661a58dff60377453f36398194795e2d8a1073891"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "1ad0176964745b1b8248d9a736db7199db53929c086708b574dd2fcb3a609a5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "3c0c3c661be27227e5588f9386e47de59816b99e389597e63c34fd18a8e271c8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "1d9778ce9322dc9da55df2ce2f37ad928918068e95278eb1c3ad50c86bf5512e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "8c721ecdcd2b2015098af6b2c5af0c210f6264db42a7d6c4c6769efa24ca24dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212984, 512, 2, 1, 1, 3, 1, 0, true, false, false, true, true, false, "f51d83718014961714354202c04d3264ea49779bde7ded9bae4a05724ceae51f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 1, 3, 0, 0, true, false, false, true, true, false, "d3e45ea7263410ecb344469f5779582be3dc9efbbf9fd3a214d3c08a8de409cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "499b3543c101ff9265776868bfd381bc939a7bcef91be4a9f1bcfd3dbd27ea14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "366730c9c681bd5bbf394c7b9919306ef84ba31ad96ba3a5e4785429742340a7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "1e308fb975542ffb07c96ea605180894afb0c2de8c262f102ba659d0c2a72007"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "2a1ec0c869feecbdf8c42aaaf49ebd528683bde8dfebe2f4ca2925f3183cc817"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "d0de9affa98a5a34a65130597f25f79ea29d835b2a89e72b7927df9f4ba933c4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "ff774eebb78447da9e6b005a4a64fcecb665c12f917a83ca080384ec6606293b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "56b279042435f999baf65776b6e0220ecaca3e2c50bdc52afe9ef1b1117a96e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 208408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8a6e8d9c9de5658c9459c7fa4a92f7197169b20a146a7cd9628cf0d4c13e7ebc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 208552, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc8aebda5e52a77607a9c1c85af2578a00b9f6544159bf4730643e4dbb476174"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 195480, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "afe652ffc264237e7945515f7104fbf408a12cf7b65df69dbbffc2e278e7a478"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 195624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "60287a04786702229bf17b481ed5530dc9af9b9b61c421e64b42c7d1a4306c77"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, false, "77adee886326a57628acfe04beaa5febb1f2d694e76022182c293a50b280c542"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvGmemSepVarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 2, true, false, false, false, false, true, "4969646ca6708bfded6dd3d9f47eb9ef2011338acf918eee4404512360599461"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "b1bf2303400a47ffb7a0f016ce6606ffc4789ec96452b81edc9751e8364a5391"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "0c5d4d09a21fe601c218a4205f0f65854d137949d31065a21624ad9b09b54de1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "92bef65d19abfe6ec244cd5da41e819c9590a991ba3f5cf6742d79baae512f5b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "fb3cf24f08c1f7be12b0f33912f28faa09d03ab96c683383d9b1fb0141e7b607"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "09600ef632a5e10a6870d7d8e2571fc30b83d61edab667f7b624f7e57c7f74bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "cb37d708ac9289e7d360bfbc2e3f9f7caac04bae46dc56501229629bc6ab755d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "feedbc0e6c06ad1fd144d2bb58f8cb595d45b7ab88ec2a884649da769f7eee2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d00e81fada174bf581690a34742838d064a2d37e24ecec44fd659813b05d1b87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d63f80a482740cba0da7079f691facceeb7fd3bbcc13f94516f4f7cf18723f66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a598818f989ef67c928b8792dc5b9ef4ec268c78f50f4d5af927a9193d11fb11"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7c165f3c3a2dbdbc0434599e86648160627218999a290a5486f16675fa76eea6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "e796f1e275bbc64633af2f2b52517aa451fc325abcf93785ab99c3138686e703"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176624, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "7ac1c6e945d7cbe96c1d21fa9992d7388cee066b00fcdafa9deb12da37d63090"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "811ff6255f84f00745900f4a3352d9aeb447908cc0f38f35a1b400fa8cad79d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "2b8a437b4b80069951be4de9999fc800452cb7eeac20062b55bd9725e4a8af61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214336, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, false, "e0c3f1f3ecb7b8e9ffd1d681ef3e76166ef18108de7a8a56a41450a74fe74c3f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, false, "e5a2bc343df571b961f3a399e9a22d0c190c54af2a0bdd397fa8637a80b394a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "70625280e579dce390af8d1d87dfed0d6a2bb7ea1aef66c9fdec95908773c5ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163680, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "76a8f0c067e370200c5d05fcff041fb605d22754b398f7d8c2a4375bc5e873b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5b551c3686416d274d6f796a8dcf9da24699e0475fea19f214131a23c3b4ec99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d08780cdd62d9d5305b4871a04a700bb3bae851edd4b176367532cf63e39ec6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "951ac0157b0d5275db32531c71049b7471fcabbdfcc3e82e1574a38f86d78481"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a26ec003e14e0a936aa346a01a8cb5e049caa033196b5d44e54c630fa27910cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 179648, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "611925eac7b58eaad25d87e8bc4b532442597141e9a68f5340eab3606d69ead5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 175376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1ef422ee3588b97dfb0b4d5b7726f3ced24afe5727c356411fdf6803e1337aa1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64PersistentSwapsAbForGen", 177744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "045dae7d3a4e7b3c419e17d4858ad870ad57ac3e483aa2f6013f49eabb96c2e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv64StaticSwapsAbForGen", 175520, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "478226442f7c3c3ba4164c327db45d25747dc2efceb10243be4a1598a60f26d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, false, false, false, false, true, "2209842cddce8604250adef56d864245106866a6dcf2b53001dd845467129847"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, false, false, false, false, true, "26f0143c377f10f9b05b583c10e0e387580d9044f3ef519bf81a228f1ff58f43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 164672, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "71559ef06d8bedaf98c1cac2873a123e2d9ad573cab0f6929cc864d8085e7f25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 162448, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "55af1a0a5d5d78e75520fa6da50cf687f8265d5ee713acbbb13b3b037394896d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64PersistentSwapsAbForGen", 163792, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "49b951e4333b61ee48fffe26c76cd24b4774938fe37413747b5c3f7796fa3488"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 64, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv64StaticSwapsAbForGen", 162592, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "c5d699fb60ad1f25115ec2bc06c980534745aa32443f87b522b29ec6be311bce"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "5d635f5fd4a5686c6ecf145fd34ba5532f8d83c532be20f4e86c6cc04c044a05"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 1, 2, 0, 3, true, false, false, false, true, false, "48503cb46877901c2331529bc161aeb12edd7a3d5ca516db3b65922c530a4e2d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "01adedb565a36741976891cf33671016fd7e902a1eb457a22e0dfe5abf472459"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 1, true, false, false, false, true, false, "c5f0ff2472af04dee29be4cde807e034e6732c24f3b92abe9153539e3b518ac9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "98cef6885048f6adc32e1744a1d89bf23e23aad08dd0bc0535f38cfa859a9957"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183632, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "370ae4e3758fd0b3463a2195072d5df8252279b3456e93d3cfcf97325b9965e8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "23ddb6d963c91b890b35696373b28195f35cd440885f2f89481b3d95e183dbc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "9ea4239244a8a2b4a78dc71c69e9a14bc61d112859258c7e7dd6589f05e1ae11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168912, 512, 2, 1, 1, 2, 1, 0, true, false, false, false, true, false, "ef33eca4943f76b55b566d290ca0f462df0f2855a4da32dfb953b81c92a7a8de"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 1, 2, 0, 0, true, false, false, false, true, false, "812cdff4732e53370accc121efffcbdc33f99479c771a4b781e13c5afe7bc33f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "9f1f1b5f150f55ba0acb2b11bb860c3546aae91af47676bc3e614aaec27dfa95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2e1289dd30eef36d8dd30007acc98683f7d753e1aad093208853aa0f399cf226"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "72f6d025501ccc94175e2d9c12f9e22471dd557e810e68ed23f88dc4d0b64930"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "19b78bf893158a64e9a77d99f34975bef4f0f4e692db6f69c6d3845fe1296960"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "cb085ac15b67bf5659016335582e7277340afd4c13c42b3d65022261a685ccef"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "52955de13bcb8d0ceca1c0e0b8fb99aa5f300d1b39b984dc1f3cae8620e5bc11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "1b7f50045edca42afd8d92c0b644994ab893e3daec607cbbd91bb0ab3d55c0ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3cd31cfc8725dd3b340a9d69ec031423e6268079c880f3ee689e5dc2e89d35b8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "f553b1810cc13809eb73e52832e3838b139866364f8d989f0ef02f8f89e99579"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "9d6f606b8d5046767d3ca5ff6cb99a8bf04a64d5261e9ef2ac9b43a68218aa66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3e85366a3510229b75dcf00808f9ee5058c8055f3a6d436335cb43b7eba8c6f6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "746b3b9bf8ade66175278897f20fb26d52d9655f15424febe3cc525a138d9ac4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "efeaf4600eb576047c662ee2eb32c372be0eae77b5fa5ccd65c7e604c056954b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c0e73baa5c9f6b1daae661a9ed990a5cca8e58d3c66909f5eb8f034b18ab65f0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "c9bdd971ce144d1bf0ae57b4c940336f2dd9274ab52048d074905501351ee2c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "dad3653a2f66cfe5465d0a3ca989df01f587d0f942137da6fb388b9f1345e4a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e1f0a1e2bbc2332c85f47141eace9ca19bff7cd0d027353bf9b6119fcfb7ae7d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "4562113a43d08d7479d6d34e4e17db688828fcf95a8ceb4d77752d8aace82301"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "97b8aa79799105b3c6752f46ded85b8eb09f3b9bfba6b598ffad7fb565847f95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "dee1ac998ae01619c44ef8b82c75dafe37fecb941068f7be0404a9c82bb25803"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "582874ecc102602634cab997506cca71bf3186bc1efe81fc58f4ca965a3f23a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "1ceab5208b607efc65890ef86da8f8c33390ecce395866ca7e3c3391c1c9d91b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8013c672f3e6d3e38ec42a5e5d8662aad4a0437e9f3b04822b619fd202a0273f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "78b258f0f01a78a9af6f38838fa95a036051bf0fc52ea9be64cd1e861a2db285"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "0d26903e0ef5355649fc3175ee77bb18ce1203ed9f95fddfd0066708d4535163"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "ec99f909126fb919233d786379e890943bf907dc164eae3f35e95b244e08774a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "11ca7c34359ac69db6449d03a2633440a5f2cd57b02eb81ee46e2c34832df558"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "62f5db41a57664e83cc1df195047425f2ecd9ade2ec060a0ef4abd59c1a62b4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "94c13578c7aa44649e63e81d5c8801ba8c1d3591aebc02813355867900ffa631"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f35f1655a70b132ba7ab838b472a7cd6d1737469b92f41f46e49929110e7e8c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "23ca8a92de71abee20ac913a8d37cb2f9bcb371839236e24561fc79014688b71"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "aa9c478bda044e476e7cf3507f18a17a3dbbc3f52d7cce786f3b68e8245b8147"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "21bda4b3e3c4196277b7786c6df8e4aad6c309a631fe0335db7b5c61016014ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d974186f251f6ee9ea87709d132264987b5c1efe5d7ff1902bb2fa62bb263eb5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "04bd88dbb18cd359f529385068444ca88f930eea0b45aeb401cf8c178413d475"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "6c845a3cc3526cd7dbd4f4d77a34a5a7f072c2ef41ae9a9114d45a2a1cf0a654"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bbeaf4d7549aaf4b51cebff7d4404c2879399058f034551e49d8460b76f62afa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "20d1a09eb61583b2ed50b7836d639fa472bb2259357f4a033d08fd529e07cf48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fce59e0b9d9118a035b344474cb013e8e5fdcc7ae7b8275815c0f4bd7ae3da2b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f3ad1dc6e7dc6a0ad650592674231796be7da85c757bd477a609d2e4a54632cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "b5cd01426f393dfd7f1c5c1e8eacca6e2dda03f1df9afe16e81ac5c552c39c2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "83438f208c967191e0312a86bc24d7ee5133312d7e3b0e9c8c42abca1a6fc429"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "433269f93e11301fa34cf219555c8ace4fe54b5a340b491f25493e67218555bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "3ce75f9067adc02b2813e69295280f8173e17ad88dcc5ee89036458d00331ab3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f919c9eaa95e9e198a9f9ebffe4bcb5f409fc9b6a43db616857ba6e8a993a301"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "3e1de78aaaf55aeae17663691185f987bfa25545437225613064be3f5390442e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "94d8b0cc4c00a6f75258a9935205ff6fe7516849920028d0cc7982845d308186"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "7175d7273e13ad8735272dd78183dbb9dd3813d2f210860f30d40b53633f51a6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "67e864927ffc7e3e2c67077c249716d4d78a07e99091261b10cf7e2e74a7b145"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "033b11aa322524fa1976274dce8c8f0f902f52d7e3b228e0ed3553fd523709ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4c8b4699570a2099ac2949f0dbe5e013a8fad1c3865ee5b27045322ec46f1b3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "674c561815b9a2c956bd9876379cebc8452dbf981365112a4f83c580795f2157"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "720e1f72cc088b70b32a0132858d2aba47301efcb4a12a062b5d63ec99479c68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "4aa6189acdec41cf47f37145b7fa1ecd0064ebf14efcf97197704e5cd613f8f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "d86e90b012568b663bfae26edb35d4fb2241d949fe888b5ad236f476ffe2891c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "d5aec70a8b528068e02b92e2e85fd507a61deb5216ef59940c432fb1841cbac0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "25cd773185d49a426288793776266b2501bdf87b70a5417103bc5a787b3f3c8d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "7859ea47b5f8087927e20c302048eb973011c22998951e069dc26e77e2bb294c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "5f7b0d32a74352959f8e22df4d7764b0dff0d2daae12d8987817758ac4c17ccc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "dc7b458f1ee59ec25329aac075e72e0ccc822c6961341c13c7ede3bb0a9fda8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "bb11fde66562f2a014bda58682c323f4b7d7acc87e8e94f6bedfafcc3a2eb7ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "0e740ca9736f74da002c0cb76d861c93521977d88ad84028018617b2dbdff808"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "7acae625c6440d90dc9c8c9717462433d8312be2ee51cf258757c02f42c334c8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "3ee2a005d3f0061a7ff958b14a8cb947cd32b686bb23abbb0264688edad895bc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "dca00fc078629789b6e90c78c41a81364b2eabd021985df933380433f11628fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e2048bbdfe8812b8e550cbc6758d47d31efdbeee756da96e9d11d36a3c2bd38d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0b678a6ee48521d562537766d0c94785cfc4cd8da04e0cf91f527b6aa562011f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "1added6f940248d9064277479053eea7b89b9e40a0ebe08b6e2bd009617f3a96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "ff40a631af17b66199a0245e6bc073b832fb2494a04968efbf15fbe2eb5e4244"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "b96c9ef020af267fcff20a7e5c3742487b40f66e7020840186a83d92988cb0a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9eefa3b81dd33b7e82169ffbc901f292af8896c8fa056239970d86f33c1ad5b1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "e09d7361810f17725abdfc70f30b6842a06121cb4fd82539a7d6c3ae9482e9b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "fc0152d639ab1fad1ef83ce5881aba6ba39dcddbd87af9f7a09b6932bf62e20b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "f82d7e470360f905ce26694cdd58f7fbe97c75d6dd5331fe16cc00bc0d535266"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "935c0362ad257fd732a5f3fb7b64b09ee064e94c680f1128e2b8826e3c02b035"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "3f8ab8313167e9731c80d1697a87c89f09025131bc4c88150bc04561cd75b8dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "e27d87b04bc23c4a636fa4174903578eb1909541dcbbfbcf7bf681fd55c23871"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "900b50cc8b9bb3050deacd151b0b556b7c1ae73c02c58159154ad42cbb6b427f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "ebdcac7f22301b2ab124957e35de90ab8997adf2730f2f2401b63aeb577a2cea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "cd7263d82e5e662546fadafafc959b3db6e464c2442ef8884db59308a4411e24"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "8f8d46e2d3e5c700c7defc9521cba71df19fff5d783c35d163a3901f376ec2ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "8ba6eabc1c04a262d0084033494cbbee46cf34dfff97556f59df36d20c4feb01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c6f69a4297db1edb897237a0ca41d2648d6f736df3de74127843f6b54082483e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "de49a587cac350d6617bb8425d0d12c71d0cbd4f02b285cfa616c0c30e6eabe1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "66db62b117dcce03a316363ebcb9450e34549bcea2f5a4ecdc8359a749a4105d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f1246e6d30b24fce1e28d1eb465d029eeb5221d0275b1e14699bbb54dbba618c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "1338227c038cdbfe8e4c559454559ebdb82549c2930e4a548f8640d77496fd13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "37e6a72472b2228159fdf5e4fcb5125409d960f4fc25328f7f5e1cedae88f8cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "569f29f87f20772a12c699743286de61792f6cf4400ecdbf1b5f96c32bafed28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "502878be64e8efc1d050782c994f4e831a17c36c1e9b10c573a27c191ed079fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "ac6efa5a506da4c884a0b630a5a7ffb95f41f708433d02ce1b9909f187962411"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "18ab68026d7e08918a0184f9ee2933dec7bd2e987b22785c3ee169f5e1980093"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3f251b3439e941c1c147d6fe290b54f80826231df34ec9bd3403abe0aff5ede8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4535784428c4a4b07bf27b4337eb11f3baba5b145c74c5d201f192af3aec9414"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "16b76e1e26bf55d0c17f4486291fcfb3ce1d167c705a2b7fdab6210de81895ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "059225707e45247e0530cc093e569c627a17272de7501d7e22e04135abe0d599"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "354ee44478aaa286ac4f2b505a8be10c509d7ee36d8049dfd647429a8475a571"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "3b9d72015d96e4895ff36bcac06edcf38fd651541a335c85413b56c5b8c87add"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f758a3659033a0a349637597153716e6ffcafb6f9266dd63d7da722cd0a00f21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4fbadb52070102941a0369176856823de4a56d8acec7fb61ed3cbde78578df01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "7e175c8f547ca05bda6d404b5507c349d7ab6162e361a64732cee76d39bf0738"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "4f6ddb2b7b08ca52b49c294820a9e7fca4e316d06395ffdbfd93f163a127edb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8f5e78068f44a0718e361a7de0b855fb6173e914c67b5bfec1ccba0d28e61123"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e94e80afca1b5b59fefafac47cdb5d5233f717911beae9fbc52268924df72e20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "20db6138bac3228072e7d558608295b23ab4fd20d16b5b68c03decc717e015f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "fed86eb5c3e348866f5feabf68d2716fb3dde676dc18b9162382d09d7e17e1b3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "d926252d9c14d832f0987db9ddd3e90a3d50a40fce10d0461464383ed25ad4d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f85df78f37b327bedfafd5cd5b4b20c5fa710700c58f42f8ad2e46780b860f8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "6ca85138cf318415848ad1e91a3e43a1e11c3a090b0535baf69caab44607105e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "95e184a3ffa1dd1f10f4681a3256586b3dc62d36f6e396f479ad5557ed0cd4b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "2e9f6a1470b15ff537e376927409048d790b5ca92fb6efcab3eb9c2e2e8973c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "718524225c7b5d18de5079bc24acfbee942e06141f333661fbfbc0c5070f13b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "7ae99c1c162b0af7eea4586287154e5fd638791740aa687cedfa5311d563a42f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9c59d904033efef23d250a7133c98248de5eea70ca9c7c3f15a3022b37c304a0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f8606ff38c70aa75f9cc413f1e7428d2ea64b6308a6b784172866ff1fa82b246"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e3a31c6d8a0f6ff5a76a70aec4982014ad582f30eb9690d97033b1e708ad5f56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "df38879e1df29280dcead7525714f5bf3f0c646a0c29b80aceac5d6d13cae2b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "ba3da7fe8757475417c45bb33d2f9044f6c60ff4fc047e2d00cdc9dd2d8ca3c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "98aa9c308da903c544875c8908a8d7d6b9b2c11598b496d1ad276b6bd8165e6f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "88a012b2567c3a34422e466d890a204820914bac301d2dd286f25e83253c00b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "eeac37091494ea0622d390500042ea20d6e890506bd8273c18dfc7d2f906cb74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "4bcc9c731fa6757f5196c4154dc87846763bed3eb8e7270a9177d911657466de"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "0249306c8ef4c6d2a991c975c4beb29a4ef17e6aa5847a8e78049289dd094090"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "635bd63d4e41b0fcaa9771be2d6489a3058b45ba11558697518b3b3573480566"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "d2e8f1f0b680dc147f72aa1a64b525ec74ddedb05fc57473d9b553595ab504be"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c2540b6fbc8df0cec87bd05719229cac71c8f1cea8eb5f6903360f746481808b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "10a99f913e73c878c76fb79745a43174aa0dc909b01354e1aa741e2147d20be7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "ceb7a6ca4e7d34bc28a41881268592edd692c54b4c63d691e6e5b65bd6f70f26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "2215793490c1fa4ce525a55f7574bb77e2be492c8e2446c89c1d7e0347648b58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "093179a1e166c80db9d5a8d9571f0b6d6dfc8e2e99a1c3fdee3a3b13988deb84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "0cfb0ab7e5f4bf1ef5c4065138712894bb26cc6f8fa192103fe141151f6a1c94"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "1d3b5979f09b813e3109fb3aca55ba3e157bb70f10c031cc0fa0156b64f766de"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "060c110359e8dbe47ac3ddca883db6d05fe5f7f468a62c152e952a3ccd91f268"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "d49c38518f2e541a540ca07c0f9fc11fd8c8dd3fc9e6229f1d746ce6558c0f9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "fc467a6c6febabd2479de50da5e426e078407a97c213a9f9c88721b07671039e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9cadf8895deec7015d0d6a6249d18986715c9da6111d124cecb44a85753119bf"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "009f0b102f66dfb1cf59cbf94d626e5b8e58526178d0282d4054c3cf53216bbd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "50bb362144b4991032500d0b09b119f7b18cc42c17475548d0fd6915bb41ffcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "6c51ce201e69b5865ac05b421d197ecfbcf9095671ff8dfcf5fe105ebff9e861"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "bc039148dab46ebdb643496b9127b9dae96a17eb276eaca8ef5eebce4e9a9e6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "28ab9f9d07f4c7d3cc67decbff45a851282f6f893816b736201b8c689a4afd12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "2befabb9e707259664408ad8267849a564b4aee756e180ffe11486c56ad7b1f8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "568ec6b0602f77648ef9a0e68224627c70d8aa00552eff396ac8bda45a25cc2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "3ce4f77e5ca57ae0a765c1bc47f37be8ce91a943778a6ebfeee1154b7eb2e62d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "0c7361b41595a48942ab7c075fa05b7fc112a42193bba8120c3eb519298cfb39"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "eeb98a36b2459cc342c38474125c4ca1f8f71308f36fea7066bd39d6d7508769"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "3077ebc2a22dad89e603f2155215f2cf2771cf927c008b098fa2fe3e4c0d5e57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d1ea1bed2c2aa52329208591f54755af33838a1ba168ff28daa3c8737b0b8542"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "2ef333b52d7a1721d5f353b3ad9631e83908ae1b0eb861a270153be0c7c3c34c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b2348fc705ba79cc98145db53a9b1342380c4bf26772c0375b8027b0efb2a506"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "525804b809df1db949502a84581fe206633ca4cecce51d664f768ab182b1076b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e5e9678469df47b2a2b3172e8494f56e962215262c41cb0297ddeb38f1b52e07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "e6bdaad1395b7b0af9c80726c12202a4c8fb384707a278e6c3d03937ac82c5e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ad1ae4afe9481085f0f69d98c75a3434475d85ff24b5683f627c5a33cd62d386"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "7d9fad3235c6360ed28055216b42956a276166ec2dca8bb0163cdb370df20502"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "6e8350005618ef1d7c620354a8928f203e8b8cb720982decf7b1a255f94d769c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "506253d0fdfefe8b6307da082d086e9660a01b547d56e4b628fbdd733cdd14e6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5eb879ef9e028e1f28446923e4d443a57e90acf8918a5d0167b7f34ff5464472"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a1bf992b1d7c2097859960d0478fdc6819bf3dd5d4e20e6d63965f5c540578a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "232162645c7714583979001c38177c06b195fb5d806040d84f4a4e0a66cbc31d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "14f957c25af9189cc18d21f729d427db44d786bc62b36e2c13759f72b2ad66f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ebeb4739632d42e507dcadc4a44c9b3b7a186efbfc1b6fe6a877b2f012da0daf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8acafdd052ce9f0bc69bd90b77a766c14c27d661b2fed093d4977b8162227553"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0f7632e18322dabdd639f200b0539c606cec53db0b17c397fa95898c6a77f0e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f506ae796716643fc41b3fc7eb8260da58eeceb07399c3895b6ee01848251f0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "14e7fb9e27c1d9ee41d2c3293a4c3ad4fe71e2c88110d79d858fe615585af375"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "103efd7c5f497115f7c21270e1aedffcc2c67d67de24ada39155505a31c9230e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "8ae2aa28fac3be4282e09cba84a0e3cac35c0517a7081256ddc120071ced5016"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "41bb352ffde8c2d8fcdb7262ef8b745dac70ddfa31bf550c4bec42e3a3d83dcc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "f3657d949a49ec32a6a88f5b56fb5dac090d649c1f0d7c4e6ab287b9a3a8da0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "49936ebd2d0d926b461131d8b3137917d6daced3a9b890b052f73e2135052552"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "5a2a931d0b9fb4711404f8753206d5c1bec4f346f110f6bb1e80c2ea61da8560"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "66d816a2d7787fe3ee22b3d45baf12f118a0e7188f4332898441e1ea5f6dd970"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "dac2a167f24ddc29371f1fb4e4f41836a60190fed269569db6071a0c1ad00a5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "93271267b8f46653be8861f6687eaa09dd4a422c35535dbf91e52b2f70b9e598"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "319d6e9d75624f49be62ce058a2ae81abfc3099dac11a6721f681dabbebaabe5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "f2fd716aecc637ca3ec00e2ff741cc7e6ae3ce9a4912d06739dde76622b5cb67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "195c5d303d6a11168f62fcb791938227a59eed7bdf82b55ca92aa41ac93aa35f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "51ba91646c6ef3c6946103268b0cc6b990ec29375d4ae4f4cad08e781e98c060"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "379108d6c5ce4d1647b25621def3649fc1076ce5f884fc00c070b9986ee0d169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "4e3feb0d69b5319203d069f6c652ce553340adad8e8de98cad899fddc4058fee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "6fcf0f2f0849f3412f2ba8594cc97d15aa42e844c846ab9eec22f7979486c1f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b8e03236db5074966de3577ae42ae8f4a60dbea359b645dfc51516f8c595aa17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "6acaa5f91c3bbb189183e54bae1ecd89826ad4f70235a57bd08ababe66c90f89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "69418051e6002c25bb81b52d855d96cf74bca01b0af779a49a87172735080847"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "98960193018c5f8a9b0037d99f67030a3a43567d862c13e40fe5256915471844"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "f1a934a2d0d84fc3ef4326ac80688b08cd84bb9554551552d55e6ce87b0cee16"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5b984e2b90f5854af4ca50d8611f7304f3a5f80697529637a8d01f68a3fbaeb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7b87d6066f6b4f701073cdd0d94066de7aa1b2a87f596ef08fcfcd628928837f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d9cb9730fe4c63e79bab126a1bfcc4fa02dee12ddc3ba51cd9c0229bbb91640a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3a0f099b677e0062a00bb79b59131f34f335e94e2238129f036350ee5d70bf3a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "f018e4e09ef0a17fea57d2601bb5a819e015f4aae391538b6c91ee3f27028cec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "7703565f4daed7720eab6ba1256fb7c83352eac4034c14aeecd758f72f7be42e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "20385ac5db02e198e0be994de32a4db779befd665e282b1c5624c3c1b9f84eef"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "70a00b4f82753947fc19ab9a4a8727e09870423750d2c2b8a67701df82675f57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "58c9cab0ee81d82b7b18ce892a0cf91626b5479d8bfff0cc3e2ef80c3167af28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "46fac0f90cc962628c8270cd4fc958bcb814484595e54d9e73171ae3434551c6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "986743f49e6e57799a2c36e375e29240f9bb1ff203c492ecf4d064fe4e6bdca3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "cc7e56ae58874a5905257d600722b64173870ccf70aa146055296fdea2c16ded"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "11102874c8a1101d356a6f649da85e2088920ff32ad9ea86cfc53884cbec2037"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5b328b0f1e77bb3c24793c076b2561439be4347a35a87a03001d483f412483d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "0f053ee08da23f9528bcffd31fe8dc9241027d23325dce625f25d0bd80d2684e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ad069143b32eb9f9841a35d7ab604c6b72ff04b01b7c8de5a8c0d3edc41ebd45"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d95635d0a4bd9433b8f89ae675c02f31f179b0b81bca3d2bd007c75432a1c1a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7dfb85f4d87440eb1027b5d325fc9c244b44cd076f582f4b7c49bb64c336b03b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b1d091041538ff7c60ea59f9214af7c773e85a1c140ec307249bdd582ea7c4da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "59949e622874f90d145d1d590079370564493ffeee6e937f25cd89dbde1fe38a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7625565340d3c83e3d57e1532b4ead59f242483cbb44e8540377d6d24c6b7d21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "089b245266bf05b78d39bfbb1594711b9afefeddc12915772dd5b6fb54366946"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "fe3d89b3e7f75083fb41988b6cd4e56b50c1b3fec375f197b3d735742355f45e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "3043b74b6065961ff599a915634d0b94ef84d8123ebbfcf2ec61a223a06575d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "9631f89c22ca64ad0fe5eb3e40fac5f666f24c7f0652caf03422b59b41d9119d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "6ff31bdf2fa48438a0ce99218c76a617ff782089ad1eaffbdc197ba10395bf7b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "48c9e58e3857dfa86c9d489dbe6ae7ac18846569e52177c1494966f37fa2586e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "a75163e96d3ed677870378a62adbd3595c0f2252154a887a2f09ddc46cd3f6b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "3e0cbfc1afb10623a21392c542aeea81dd84a6238637829cac7764d10a4ceb7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "54d12a2865cb091abc301e6b8ac2a4a71f7bdd72012dc02b433f24658066915f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "524c76a1673b00c24b43d2767fdf5ece7b4a374187ff2254d309b6c3f8e487f3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "e6103c37823ddb650aaed9b43f1156aa0433bcc105514d91ea94080cbdcc7eb4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "360a788dad0c6cfe9927ad5a32507f1072d9f3214d5b28f278d8b30228267a3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "2fd0eee537adf72ce5b08ddc14240cb0152268114a5031b5f4eaec95a509b428"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f87a87c1dcbc09cbfd1dbf63b309545ac9846a8a84c0f27e6804efd88413f56c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "1f5335407eb63806674ff80c5dbdbf2c301444c7284a55bdadfe0c24d956cdcf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "179bdc64355512e0726328abab087ad540f6077e8fad9d08571189a617bab075"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "2ba066c2d0bc468b4a76862d8b7916690fe14ee1904fc91c93f3f724ffd11f2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "074888f713ca97b28d3230b895a585dd64d9604160ea4c7e61cb092e77699b9f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "ace66dd9ab152106d14a9452117cbe5451f97d8f47b4c37e0a7ca4ca9334f69c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "4364bb784f44804aab62687056374ff1a1ec5eb77f97290c94c1f872ff7329e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "c93f52c5dbd2ff1e52af81836e7475f7229bdbe35f1cc842228b6d55b50142ea"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "58eec25f3770656f5f50734b80a396919f4ac1f35e65cb7facfdf52b2c149218"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "349264de264b20c133a962559db7f5711517b77dbe5ada896b864aa7aebd77f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "1350909c59975455556c3b88a79853f6ec1a22225f020e38387c026309e729cb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "d66698f7cb57cce460ebc9232cf432301b75ff98ec80d6f318ef8d5bb441e71c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "2399dce4b39d43839870119a3578fb54e4134a66196a5b2c8f4ccfe7d63aa7fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "86a905cb9c9a194f1118ae4336f96088bdfab410b13a62282bc8ff44b47e0bfc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "db8010a119ff85e3eca0cbeb5f01825aed2fc679e48ebdcccc88eea459f9413b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "17d3bd5e4f9f039585c340a2da1d149574857b2262f8967a6106699071ed8c7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "7f3a95254b0268f02946a77e9983f3a92fa29bdafa0eb5a8502248bad90f122d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "779f7f8e2c901acb35ea9b3c668ffa00757f9552bf96b8e305e9c0119b4ea414"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "447471e1e4b7c5122fa3de4f94517d809c5946982138e01eadfa90a74caf0396"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7dcc59ffdd068ff6255e51cb8d12921f7610fe7653ac3bcca8499812cce114fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ddffb9534cff004d8e36fa8b30eac49a272e45a0ea0a486c2115fabe63b83efb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "e8a70c0999a13d37881e0ddf89518ab00fe9a90c7103a342d686d0043c70035c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2cb05bec2ce33584ba090c3311a83e3c95fc43ec03637d4dd34dc5939c1adeeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "c5efb4bf753db1bc5958181158a4d71042de6d7fd924adafb9de52d4d28a6495"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7ad3d08f998272e58b1d7d062eeff0cfdf159446877d0c78b91531c3936173b8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "32ee655aea9a490b12939b1458ee2a44c8761d3f28d4481bd6e58f82066be0e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "b93e347526ebd1aa4c5adf68f71d33a4df64167d30784f3f969b57ffd29bff01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "8fe08e0a53392e4f442d434d8017712ab1fbc06e2702316797696592f03c3a29"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "47f57b97b9e828fdaf9121399b48517d841aad99ca05015280e01f75bbdbd6d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "cfb119df5401c3af9e0bc9b7cbf5866b9cd2af09b661a0ab6a6c2c175a7e2090"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3c1b268dd3423551e9d89b5722d8db83e6941d4606b2d85fa11c13ae5a96aca3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8f10a6ea966d528c355d270c1c5ba1625da6c8339043e63edd67a7bfed9b28b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "ce04cffc4cecfe4d1082238c3e0c71a359efc28ce5f30a027ae3d4c3e1c4d61c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "2a316f25f9af5123d8255f6f277639558f594d6f5f91451792d5004e9f096854"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "25e8fd268d721fb9b014a8f1dd3ffaeb0c519dbf4edbb0e296754fabd71230f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "ceb641f07d4e1f3694d7c5eba9d26d529049d1879415def174e0bcde5c178b0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "054777aa737d250a5fe0cc484b1ac740b862c60f8b0e6f3fd0bc48d20f9ace1e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "0d44e078c9cc7863b08d7a1e027549c763ed9cd49f4489ffb9c7035ef88060ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "7856bcee8544ee576267e30114f98a1ed804378ea673f2d34b357fba433dc611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a1ee6e9329eb58654e96e2d4c6cd4a04a2d0e33c9711aa5ec4b077183103ab8f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f328fae48efe8843c3c0bf08342bcdf66cdc9ce296f5f2d36677187926f0fefd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "cd0adf1dcbc29f2e791b551c8ce535dfc5850ca93b161c57c734e1c50a5819cb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d15df367b932633df141ad56e5cfba8c23b5d4987eae05c192d94a1ed3c09fc5"}, @@ -4938,135 +5982,163 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "74c7c1388a2463db94d86349a9710eadcd0f770b6e70db2aed66a389d483cb53"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "eb62613e96c866dcafdeaf422fbf1811d058c02b1ef38ab71150e13e57465c9b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "91e80bf7f8982747a54ff57780f4559e6bc19dd23fc336e951e78a548e144329"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b5b643effc937dda1e03de66bbd26fa519772d58e8a0ace38f796e4018ac2f2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c21f6c953931674e1f5931718eadc33bd8cdad75b8d11f0c4cbbed34be6460b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4cf6c49f4162a60a744eef144da2d2a53ad15a0523f98d8ccf1980996952d6bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "088b7213939e3f9e1fd5b4d90ceb2f8fcceede8436bd1cdc68c848c7a6c32322"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e930a859884f48aefc725e969fd5089ecff27e54ae45ac0e365304e44d0ce6a4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "776a27a1854f8e2d9c459ae6147d3d29df3b69066ca99e4c533d11340f79295c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "56d3fa19d08960f599a387755e2a4ad312eb58e4dd248bf06e752841e7e9512d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160096, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "c047425462db41aa9814baa4ede9ab54bda78f49960103ecf7f4bfc5554cd368"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6d33def0d783be5d9b20be54df2a92147777a88ac1eae287fea1c960b77c0f59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171280, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "64c2e8663fa0b98d1c3e5e6382ea395363df9d0277d279783c8b9931e7226748"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "df17c0d35862e7251385005564a0f51ac9f30d1a225a1c7a5653ee9d3421ad25"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "23da366ed711153f492b0292011569be7084f1627563345722e0146708c8b7ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "7c77f52b822a3ac183c5fa978f4aede79ad37b83af71347383eb77dfdcd38c5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "97f3be5707262fbe3c7ca01a3407f7ae78281c2a2ebd9ed26f3483932594d6a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "c41da070378701e383437d365e115db937e7c978e34a9794ed5ad13abe59e7c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "e1dd2d31b1206fce2912ea7e9f73e9d3a97bebe20c561ca81b2dcf5ab84de978"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "64f7ef64795ba92a15f58c766d1543c82957e41bc80360a80e9d9bfc13a9a3fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154464, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "73b848f358478a5b758c08d6fb1641ab3aa8de723e0f5547a68b5665e496b08a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "2b3ee015b8b9f6ff233fe68808cce2746f5142af4174806dfb8350da095c4dfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d467ac05fb310824adef9e372b94ab9c3c10eba242c2d552857859c797c24ec1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b7a6dc8f56c52d78a5cc8ed03ea8457510b5cecc530cf21bded305d55df6f786"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "678ef7d123c6b6e5a3247adc8f03daa074bc912832fb2a5c555a5c172c45071f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "5a2446dadecee92fb7f8a698e5f682e7d3dba04ceacb9b0e3ba762371742a4e2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "b96ded67a32e42e722f34911a4c9c20032f91ddea09f772806ac2d508bc887ff"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "6c56bff58b11a2f7c5649e69459bbfde916e447059294ee2c707f23193f16502"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "396762a3b63863e6c52d0fb517b8bc48fb4653c561fca95d208ffc6fcc1f5189"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "7aeffba704f36b87cd29ec95cda2f55a51ab87ab8e476c9be4c35d5816bbf2cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "f006fcb456d29ddcdf4a1f7fd08ad8798934b8e2be7f9c09f3a7a930a0b155b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "923a00e50edced98431b7f850e9de07f865a8dd224a40963df3015a32416aec0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "516933955425b87fecbf86857ea198e06156b5caa4f0a7068f66accc84a4dfb7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d3a05aae8f4f1b8bd3512c2cd58644dda7becc2e283cf5fd390e3aa690738794"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "30f202446c52cdf3d66480daead3e60a8609dc8284fa23dcf46195a476b26c5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "470de3d5fdaf320bbda1c7f4c60aadd77ee6bb1a74d81d22611d666906c9a1bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c26795663156bff98840fa4fd2ffa863018de5b5ca0d973d54900ce966107b25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "23b182af7d1444b9bca06907dc1c4551081929b82d9832390fc138842375daac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "6212f8fcd29237fa682f3806b3d23e9509363fda660067021262cc54fc7afc74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "d8c3ebdb5ccceb6b6e5d6ef170d06bc44ef98d38455e3ee3484e7fd74210965d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "dbf9efcd142f4b0378e8da2296cdad007cf1cfcdd820888dfb14cac340462536"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "4c392073a6a9ac117c091e3d0d3aa73ff40e6897f5aeaf391a9b450f9207f111"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "eb46746b516dbe56784d4178e4e6a691990ef8b5a653e6ad9994af844d0c5aa2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "dd2ed67a1827f1d5f11167129fec9b68f13ce48a75d3869dd7dd82b494789969"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "5ff8601bae5cca913530889558b1ef1e65c357c91ee09b76a16499c462d60d5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9ae7c18e24731b0a1bba4fa5ed35d0eaf5c8c8aa5fd51b7e45265200c855a3a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "52ae94a6ea0676726ad99ffcb64c758484bad14cfb008fc20c825bca8c2e3733"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e50d0ac7582ae1977d33af0f135d7f8968b6b69be8a489db41db767274949f9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9bb9d74fd98fa084ab060c21ab0f823ffae6114ce06508baf7c6bc9d7b290ad3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "43775f7aa16b652dbfc049bafaed398384216d5ae5061be9caa61dbb1c76ed3c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "cb7a0493e9c8e6f0f6c6bee2eda7feb30144ea5911056939e710987fd41b5224"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161136, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e66a8f8f61ff42f64c3bcfde50253ed578f8a4f9abd4a13590e43f734842585d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "2f5fa18551fa7b767b86d550ea99f7958c7b0fc2911383c4a6e84e84ef8ef7a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155376, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "bebf55d33b0ab713e1e8f56b8a0abee96861101be8e847149d71e527e11914d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ea2f1968ed469495d72531c2f24da2a8f0b8eb277f860fa11ead1cec06c8a0fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "ad377598ceef3af99cc62944aae00ed842a63948fc47b36160552337fa9021f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "8f9ebc9b367c6f4bc382450ec2686736c5b861008c68b5b6e2220e260d397015"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155456, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9e7787c741819a7a091bff5d1db371fb2dc82653bab59eed1599fdac4e72483c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "4180cf24747a90a2f99182b49e1c1c7728d7ecbf572a7d8124a4d70072f67784"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "d467cf738af0a15d5af3ce8ff565f911c2ed1ef4a50b924d831309f9fa1fa99a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bd68e585ca590307ffe981fd10d8e2ed887f6fc643f3237a49679e9294aaeb41"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "c73b875cb6a8a8b19fca63fe76f71542fd596ce177df69f370444801e5f3740c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "02d5f5c04d005d385cf8b57c6c311295c66674f4c181fa7c0492c7bd06bb2c2b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "41bff44ea9e5682706e27dc0529be16e808747c6ffcfcf902697567df48b8f26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "37c6855b4c08780f5fd94a356c0bc4a1be2a9e94edce6a16bba8f9c317c6bfb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "187261171ed4d7284d32e71574d88366d25731de31d7f00ea0cbc8b8a34b0c5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "00201e5f4c9b68e0c2cc3e4b7beb2d977401e01cc776ec65ac6d9b76cbadb473"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "a8a85e16769d1a4f28879820960100a1870cc0d492cb5ee8dfcda08ea605e5ab"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "068a8cd0722d26c97bca4a8a5161ac1d83be369f041b67eba1488192dda98231"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "14912c09fb2b204ff835fa7731fe64e4667c949d4e135a724445f199fd7318a3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "1751a235b7b4634f8409c5ae86fc60ec463bd9ca38c6238145c1ba7606cd5ff7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "fe27a1f2a2c42657a8af339cce5c3b1db3005ef44950d5b95637bfa0e5788655"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "746d0e9a66a9d65e560f2bfd2e1a4b2ba2082e6052a3afafdb10dd5ecf70d940"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "bb3e6e83ed005622c8f859a539f334088c4f192f349c1fc0d2cc716d265dd827"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "37bcbf13deb79b42852eb49196d2c7dedee80b03ed6393ac1659cf88428be057"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "0721b85e8170c9c523a90abccc99bc03e8c478dd514a09766781d5d385989ae0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "015f5c2c5be47e6caee34bf27101b998abba1d03c86edb1cab0ff63154683d01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "3738b6088a29d22b762b6c0e1609ded4f098bbbaac1d64bb9f7861df89456f2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e461c7fcce2f3cdd616b5cc9394956546d9717c93768cf90716c4c0d107cdc4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "410a838746745c14e0271b1804f39f2a2c8afbee817d0b26966ea0bfd82343e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7f8487abd5d5c6645bd4c70eea99c6bf221fb3a924b2daa41cdd2bd9b32da74f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b8bb5d9b78330e4ad400859111e9499e46ec554bd8885368129e332584d9749b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "f90a06d92e2be0ef3804e11718fddd20bfbcdb6d2b40695559e38bb84f937864"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "349cd55d0c2a62f38f6fcbdfe8f17cfdb9305425abed6643d55fe07465a1f825"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160096, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0635ae8d682b5be8f306ab81ce7eaf1241d51b75dfc3112b847a91c543c4310c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "732476780279dd61666f59e59975e84f96c4b55f4664b5c6288276f920fda8a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171280, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "729817c88eed42dd52d4df0e8f46a63bad41a21f2c59092bcf36ebdcdd4c9897"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 171360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "65f36802f48373a2bc169da58655cebc5fa59f765518a5ec7059f5da8a43af39"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "0d78e6e87a2007771f546d1f55cd1c38ea9de8379b1e7cdaf8ce856f3774e21a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "e4b87143e57bb0cd9d95ffeb2c60a20779a6cef4a3cb4ecf28f23de6a5382bce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "e738a88746c7390b6060b60337ec98aeb109a2b0c05b5684f79317b8850847d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "291cc98b216ca7ebe37dfa96246d3d687d3f12dd7956c26eb0edce59be374e08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "70d0d9e87606959e222ca2d8f7bb0078c2edbc3083ab826b73fa9f92fe5187e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "4799b19691588f24f3c015246468ac8905c0fbbdf4c0d5cacd2a2f958f79c957"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154464, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "85cde4d841e534a0385bc6a19eec5459f8fe871cd3928302932e3a48d1025119"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d3825ae52db39401a316285178d4879c69a4930dc2779547ec41ef99511a32b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "7f02e697b38f1bae37a50e7abeaf716f3e2ee003c933f1cb4825531511ba73ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "53d8351641da36463c98e6575b839af89368ed1588872608da561d3b30529619"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "324fe7f7c3ec4f4e34035e0ba266c9e67d1fa0cb16e0d10ddacd76d5b73b4cf9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161136, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "5e1e05f60cff13c0b18d7e1d064f6745087a311561e3b9143fd8a6a337058ebe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "047ca7affa15bc7d8f6ce2e4cec32768b48c7dbca4f8738d22e37629d01684cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155376, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "9294d905b869cfb03e5222ba19d97ee6e5609ef81e49221df883989b8c33c959"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "b335044b9125b836bcdf34e1772dbee1e1ab409e87ec57a4a3e27a618695af63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "71a43ff2e1ce9f3417bdef6fece4c2ef435617492cf893ecef4456fa96914d39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "2efaefc6b7f2840ec1a2c0cf6ef06784ba68e07f71dc373d3271e4198d6efe9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "354a9bcafc2d2d75576aebd3edee36098c0d4d617a8e457ec47ef9a4174a213f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155456, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "86098eb2ead9d7dcbdbdfed23e0321d295baa9fcccbb2b26b48156b4d8b1efd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "d8d765f1f2c64f14c74dc8f34281989061c08fa993e40f2b2123b092a5b69124"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b6f9dfda0165e70cafa3ac4aea4b0f5208ee83df3d7652d4ccdc0dea09aa0fe2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "644ee49e168310a2ecbffe865e1b0cedf4c932974de9c963aad284d45f89bdb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b84146200088eec69afacb914e0db96ab670a79109f14b79c471382baf9326aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "f47d29d4bb65c9c727f6ae0e55557142ab07574991a47b4762ac64be2707c678"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "629983bcaa30fc561a2312daa4dcf68bb46deb41951bffac38646ebb91cae676"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "2ede7788074fa03b725d0f9755180de963852c2f887f3c3837fe2d3e145d8cd3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "dc980aeb7d291e18e27a1570f959c47c7d31ee467b6de38b997f45815bcd4143"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "82d56c1f202116a2dd4e45a1aa042a9468d737a334063792b8a0ed5106704c41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "bde91721f1c4e8fc4d8f8e7fe2830485d572e5ff2360bc9df69d2edce1690691"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4f4fa447dc92722741652124ccce4de0a58335ef7a63d543f24face91321f5ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "67496863e95e398d2669e7c9ff5002ea0af867a71693e68f5cd87b70a099f797"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "516e345c5442923664011b2c19cfffff60a4dd254a4a975ef4dd1c77913b27a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "460ca7574d2fcd87f75fae1b35333c8f3a08ecf4b57477f3ca6f517e35f5911c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "335d2ce1b66a4adfa5cbd80b88a7eda05da3028f1785f2f25486ffc31270978b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "3d7c47fdf858821d33494f623124733a10367363d6534040a2f41644eccd7cb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "52c87b8a44892737168c9eb11847f4d69fe8f862400373d401987667938bfda4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "db247391535be62b9e2cf4769f18bb23b9e26d25fafc5d621d51064e911c7046"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0727976c1779978ac7a92ff0fb134488300c4751f17e2a4ebacb92093053f21e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "00e16eb7750dcc16b9eef25e639bfc554859a297ea127d2c5887e1fc35fc227b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "0cb752d4a3223be72b32bddc55c1f022eab20b2bc0bc3a94002b6fb0ad83cb05"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "824c770561374b5ec886fa0e0539c6e4fd53246a77f8bcca10bd8177b756034e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b930f634eefd978a264d3deefb4c2486247029ebbfa98cfcc6206ed9c7466370"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "739bb6a0322497f0b3d1ad8c44a4ccdf1ee12c55f9d554b12348c7969ff0455b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "2428e325e6f5123d0d144669f28ba81c05a306b15b6e04cc01a8b7c224c8584c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "dc50799ffb22027f0f4b9ecea6170525c06e7fdf42de7a5b6ceb9476e431092a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3141040a04a8fcc07dfbf2fa15742f4a42e48f07f6bf17d2032fcd9a860f5c1c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "e1803263271a9655236a15e43616feda5681fb8a3a5a459d8831ccff5c006136"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214272, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "3af2cb08c22726e203e5907c32077648c0fa2779a2bb72f4221a5414a314e502"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9673754756bba0bdca5915fac9916709f47472542290eb8e9bad3a5af48d25da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d738094f68d868e2895903c0a2a7a821f2a3f5816b0765be3a5f15923fa035fd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "dc48d107d1beb85b3a0528578f09e3df982c67b3db1453273080918614b7018b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "e072755b6800ff69a723fbe56dee8dee7d984baf5475d2959bd25ce016599af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "b51d8f87e1e6ba07b1474fd43bb2d8911f7dd986a8b714d5ab04c638c7495816"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "0071b39ac1559afeb7d56ca64f11851a317b260b02bbe7d50b3acdedb307edaa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179472, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "06326433b053a2ae4c226793a70782e29790a7dd308dad33d72fd0d2880e2133"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "b4f9dc5d4cf6ad7c48c506f07789f0fc86815519634eba72f637c509eabdcbdc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6e03d02b2207290a04135541e57fb9295f46d7f8e0b47373e1f9880cff75c28b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8ad2ced65d2206dd986c368fe8d38e729ab06853f662c8bee8ab1f945ab7cf12"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e5691fc3e80eb66c97eda91dfb2ffa0b3ce16764f840255187b4f2ca5fd41b49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181504, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "1f71a2031a18ae530442ced9b82cd18679203602f4406e3e8999bbb4a84dd6cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "e0b15ee273e935f555dbd95d4c343ab9067cbcbeb530bd7c5f4f715ac6f04cf8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "90da0886efd4423db40703ce744b305c7ff9f61d0622ae8e7bd3b49559874fb7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "0946d867d25b640776e86b07e9b1fc78885249c832eb45bd4f3479c3667acd73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "e669e38c13c9df1f0bb68ab00a8c9a8559a68fe359efd918f3891769ae7d7c26"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "efa63b6e96af05ad05c273e21c4b6ea3cd7a2bdf252b7ba6c595e3685106992b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "792fe622013864ee464361f04558559364fc76168978daabce2f669eb4041dab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b27b6a6d90d8188b7e7667b3b8c2f92b764a4320d0b4a9351d293bd64d17ccc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "466ce09145bc8572a7c6b014ac8e04c48dfdd73f41fba9ad75b5605a7f3c52d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1fcd5f2f3037f78b1c5ca4ca59ba982d8400f74a1365c1df32e960849c4c28d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b99040bfdf8c463a465fcb9d5599d4b9fa31ea226fc1c23a7f1481e8f4690e47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d334e988d1c3821da05728c8c10d8cf451eb1ecbadbbbad09c1dbebc6d5f8295"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "56c9b508347114a8c72bc24d4f59af2c56282516654120714ec89a973288fa3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e6b1def62c83c6ce74ebd742e29c29fa434d83d851f5aaf5aa23515c9c596308"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "6f83610d9f0f0183cbc53dae759280134ae1a7a87c33362f5065fa11479b7dcf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "6592194b3d3757b87c465007ab28d9c6e1a26cc170c5f6c3b976ca7077b50bcc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7c8bb50f9131fee3374979404cb7ea177cd88713ad475de4dafd747de9ef2dc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e3c3d2b94a92ed0f03982dedcfda977fd2c6ca3398fa2b1431930faa019ac4b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "531318ef86dcfb45acc43244c0ea567be971df6a1eef148eee761b43f609aa1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 157504, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5eb14133513e270200c8262736095664df024c1d9ab31bc39a8a7c55b35e92da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "bcb4e4e8033ffe3021708c0929662d3cd4182b4eb2cc0ff13bc2d072e6a5dbc4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "dc1495d5e0d2fa7015619a63af2f0e78fad5b6fed9458ef1396c79a48231607b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "4d9806b26c07a568cada7c6c2ff2c4b205af27532537e72c2b6e33ebde063296"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bfe4276f6a42c5fb310d8d58e428b041f79fbf214919d9a055629d731d4e1327"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b0185c7ba2d08f8ee7abf1be34b8ab24274d84873d79dfd9b5146ebacc0205b6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "79b05be43919b4a180b69c19573d1ab3f01cc851410cebd99821b1ec6571bfbd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "bb815a614ba7b2b7cba280fbd04efa849745c4813f55c57f058fbfec1ab4dbee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "cd4d5ff631aefeacae4c28217262b06b1a9d66e02a57a55a42d1bae6dcc34fb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "d13cf88d391bb3e799c9ef9e8cbd5fdb8302c50279e3b441559e639091e776ee"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "6582f0553b456b8e0d60c3fccc2aa3ac2a254ac77c3dcaa00326fd24d0bcdb8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f7b6a990afbdcae5939dfa1fc0e4cd6b66cae524716af110a38f181abc0f9b31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "e300bd3163935c3da490f218cfa51f0e7df03ca754c574d506085623ff8cb8cf"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "d8b81d09d54c7c7d731a6f1621ac4410b0f5ea0da6067ad1c1c85250cfd5742e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "bf0d6c99a2c780297cb4d7b3678c56d918c2210cc23807358532f0f4c6e3f54e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "7a06610a50db97c3082f12c571bf43bab52f8482bdaa0e8990dcc063d6ea1da7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "4d2fa688f6e70fc6f6d9bff96981a45b559775748995229cf861a9d2f2a8e405"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214272, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "6a0df77c75333339875653a861bcc5b848c7e072fda84a2b4cda5d14282706a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "45959bcad770bba16aa77d661c587cca1cc09727a32ca3364c1c2a100cf8c5ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "1e0917d740dc1cb6eada1d83951e5c72bc87c011fb7e72cb83863486d4a80669"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "fd34feb1c9daa648ab940ddb8645ea9567452e93dd60cc57a89275ea6ad84d97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "2d2d59737ee8d4e08b60044ba2164765ee576149a90c2e8e8d8ad0e721aba8d3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b7414d53c195866eda3ee9f0448c869fb6fba750740c75c1dbabfa907bef50e3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "f178d678cefbd55b7a053397968329f25b733bc7de7b43153c5b0e36b564f91d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "5b911ba01b92d5147f8e8fa57e20a03689738f1e60f24acc9680e33b3e439a32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "102adb5ed9d1a9215b575813c5cb9519db6988db4033d84b0e3f2ca810c83730"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179472, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "5ceb4cc85e6e9ef3ccbc90061e89094258d1763a535ca927eeb677d40f9c74ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f467cb7a0ebea64bce09f97e0abfddd74d2cd3c2fae3dc3f86ee9a47f56891b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "813d4d5b7075679362ca49c76524e3f21fd3c2b7e80f5a532a08e251cb4c9a74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "355e33cdbcdc512d16e9c2b945f994816b022ab8abd4d62c9beac356fed1c5ca"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "11b4318ff70bc97f60d209b439346d4fc9364271bd68a03f87827a2026c25f2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181504, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "22cb4404bb80edeb44200f997f3ff460813dcae65fc0894688bfa6788b1067d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "47e67763a6e8a5b6f6d9913679ec670caa875013398a030c8acbb7c127fa0f04"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "b4970a634bb08aef91c68fecdc98b192bd9ee72d0661e80026c950bf2c68aeec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4b04db712f28e04482c0ec681dfd4e6aa49a0aa352a3f8860727f14037ac974b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c6a45485c99921bf14a137c9c58351bf9e3902f89d7136f4430865af9e393625"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "55ee1a22bb0ac9985344599a9c8bac50b213d2f5eb1c94496fc5b372d06845b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "effb4be626f6694c6bf9a226dd4e34d3a0df2f0f36384b3b9462af8afbaf4d31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "634eaa52b12741dd64ea3e0fc0ab39bf01b6d07fff3cf7f0c9e2f1871c3bc8f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "797b16c80188148c770e3f6c0eb22d944ea956a2f9fb144bf2d0bb209cbb645e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "57a2e1604d0a7fb1713b84058962a7f6a1f210e3a3de1ed927c1df54c72632f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 165312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b64a933566977cad3d9965d8329ca0a2f8a15d1777e071179c5f59e70c89a593"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "3330a72a45771d01e76bda7d0c8b879b6bb9aba75fe98dcc9b5dcd10ddb9ec01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 157504, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "e98a047387bde816da5d7d9d5078c7affb90022144e22b702881f869b43dae3d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "2d575ebeb1244f710cba26dd35c1bd1868633980aeb8acfe7ab559c3a3cd8c19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "52764e1c4cb90c6551a184aab128885f2ee901caf897d8fc08a953df27d08b7f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "bed62eb53fc324576e9a7ed3a765182aa9f5299c612c0abd9b884162e2459bcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "b08427a663a8d08b0c42cb2cdc4a594a839810cbc5a6993bffb19c4b0437515c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ea22662ad26ecf04a50aae6e220e9cbcb4abb2b09d956937e00f6d60e6d18f09"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c6502b1665f6dd34ad2531b763a4e12bea964866d89c81370b4be59d8ebf6ad2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "1a95ed37afa164321b4c3bf6390591f80b5f4c52be86062cab89c2b7c2acfb70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "115988e6ce59dd321017e798efe340ed6e01b0803ba3af4c79b445a4803a0ff8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "16dca3f227ef36970342d197241050fb9b8050dcc3fc46c32ede5acef3a310f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "84391ddb9b3fa77a624109563a53d166899ce36a26b8dae0fabff859473491cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "77199e994b16fc799cd5d932d9176fccf76c2fd816c7796491b6fb700d509d1d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0508aa6757d4c0cb3bd7705b4fe3df935aa8f7032f3b8b52add71533584d9636"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "88bfa2ed837ba67133f866a21f7026a89304eae1342aaabc243308ade72c5562"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5d701c1f05302541d6172295eccfc93a2f41f609607acf34a49e58580109c693"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "a9fbab3363b38990f7b4f6b9b8eb748ad4106b68974cf1eff31cf12629b174a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "0c002f69b870de4da30f80de5476b8b24f5e2ccff26f35d2ce684f1c03688e85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "2c3096db93cbf1e9c56ee9d0f13c65a7985c6c9d8b4916a470dae582246a2814"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "aa8f3ebdca3c52976ddfc14e8965afe00936f6168c96e79334da796cd4bf0514"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "58b8ce42254aab47212b5a30e1b9b538dd4444f6c07fa8f8df80ed2baaabbc4c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "c8a80b672c43aa599f0edb13f025322d2029b202ad2cb7a64d416fa570a050fd"}, @@ -5078,83 +6150,83 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "86829c72d7b4783b939fa6854688788a700aba23655a4dba9d36967ae89622ea"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "6848756cb2fd4b430ee327996ec1ee83a6e0099f89ee0164a664877e0b6dcdb5"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "f79d53b298ac2b4631a7448e07fcbe4bc393dbe568beb2ca4aece96efe0d91c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "68395837ba0b0606b9b5ab336f39ad43dc94f1f9b33e68cf0d9ff0954f708848"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "b42945be4bc62f03a5d360ab33d4c3d10c37d5e0dcec609f82399fa74b46c246"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8a934c3666eef4f61e0a75b44703c9eec1778135bcea10d20c4cdbfe1f26c7c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "5f6c692f97d439c4664cd49f2d8a7a928164e7385fb2dbcfedc4e6fcf76a4b9f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b0eb470c9d9f890b6c308215052ab7fd94ed560b52a65c81b874dc1b7b084806"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "060e3165a487d8d51fe5125fbfb4a4da374362c4d35d811156fe0876a077fa87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ef78ed7f383994fb8f7ca98e5a527c7d9e7ed88f40e7be76a3c7f53210caaba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157168, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "63e7ae4dc3f3e13cc28ce53de3d1f47e2619b7bc2aff5c3ad1fe098d536e22c0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "93254df545c23eb6cde50b135bf8d1c9fffe315bedd49ee5263073ed23a4c4c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165280, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "2103d6e660fb6161b35a2c0b64e4aee98cb1d04ff5f0007895dee9c0a228971d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165360, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d0a3300ae6cf2981016978bd07e60485d5353bf4acbfc27f549283ff1024a4f5"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "11c1cd490df31f3d749774fea7570dc8c64ea72056e8cb898a3dcd989239d324"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157088, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "10e691cab68da5d8ba3753f60c693be378984accc66264766b77306546f681b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "14a6664ea477d882541f8f9d0cf8be3bdac8977460d69a72250e5fe6c734b1f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "460f8745aef0c6b75623090669d728f232e51aac614e4093e4f5d25d209cee8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "21361ba684f11ec51b721b3671514634cff74141ca074be996a0fd12b7a04be2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "8471dd6e1c36f95e3a1405e783e780b7fe8a72d6e9d6cc51a533f2e0fc8cc733"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153072, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "033269134bcfd42358f3ccc299266c2eac9a60f9079a73de2c67030eda58d04e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "ff857566485ba17b0ddec2862c4bac812c40ce28cd7bef9be56af9e4cc5ca153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "84679499e9a48d12bd7d2c981de719f17a5f656316ade74f017cac244a5dc2b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "29d02c059b32afeae1a8b85a5654b3de75545886a803e26c82c12733ee7b5e57"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "43bb95265b9c6a7227dc90917dbe164cf42ad4d5af76d85915b0f63fe3255269"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "20c439048dbfb8ccaa25f9c3ba530d51d6b2b3fc3bcc0ade0872614757c2bdfe"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "e5d2778ccca71a75f90214465862c4b1a3d7371f871fc4748f90027bdecd9e39"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "6ef6d3cada7d81cbf9bf3f96bec0231b8e630d9ea4fff3f80a16adc577fdcafc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "077a40d07c573b1ca0cfaa50ad3a3707d69a40f43d4eb85f32f88c9e84e31014"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "429d56d32db747681f129386586298912fc6640812cc2f9e82c69d959c4e78c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "69f4b40be38677eac26dea9f24979474e2346e9c94bd6e746c44388c6455e2e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "bbd354658224fa563580ef2a921ba31cd4929efc9eda6d46293a301451635864"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "fa47d5e6c38f0fe67c9026ca62a638ba3ca6d06e23cc59feaafda160c64ec05d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "c89afbd1ce5934bf380fc843352042471e6a91d3dd70944c048381b4c97a9421"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "e65d040a62fe9b85c1052918fa9b85bd7b4715e1b586382b6a023713ff21147c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "4b8c8f39c429a1c9e6b5233efeb9baca652a7455d09811a811088a3e832a9662"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "4e447484ab99e583a4cbc7ae9eabb4d49e18c8840e83983494f5a6a5c27f3d46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fabc1e1bcde4c857e370c48f23786ca4ab82f5f4bc3399e362c646c0708d21bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "66774732ab7e7f08520dc47f5e1449a28b08b03508a6f710915aba970ff6405d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "350863ec6256840fbea35fee0f2177b03e7c4cbfa95ec4da65de0e24af8a13c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "39139f6a4c9f170ba153babc59a67ba32dd98b7e0289e154be49ef734c1f869b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "820d337664894ca143dc3ea9d82705b907dbc021996c8c93b5669541bd031efe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "cea6247b2e8b6cb827658c4076351ce4ee8e63d93af26c71129f0cf0260a6c8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "9d1ddffe8c9429934731b60dd17608e55b4b1987ef4f90effd972330b74b98ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "01472c426f30c0e258180a35ee2d3391511c927ed20219c0bede1d851b10721b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a8993824bd384813941080ee1137fc386853d8ed70071a3d1f843c74e77cf657"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "1506cc361160e7f7999700bf099698acef4afc00ef312e16bd52ab531900d599"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d31a8ebece5aa6262c873bfe26c45101b1a75e2a1ca0fff6dc7861ae2a9870f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "f3be97000053e307bc02656ea0635bdb5681ae22c94c28cae7ba051d2395c7ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "2eb981e5daa4aaa9ff7a3cd4ad0a4faaf3a7634e14e5b39198da0130b25b5738"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "01bd27dacef6ea8641281760959ae777d8b5f8c6d5f99330388ec0bb0560542d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158208, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9a33edd6f5cbd6a8ec829b7f0698c58684ef892643d7eefb075dc57a64196fdc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "a8649d9798c59f7b609132714c891d6ab3cee669281837402ebab593a783dff7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 153984, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "0c57433e9bd4acf248212deab020e965c53df20c0b00ba2a3b6289522905dad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "429699cbc285e803d51df99c3511ac04a475a219b246427bb42efb3a761ea726"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158288, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "8c05e9591a0f493de95c672bb722d84ea002550ac202fa1dea9333ca01096e5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "8d57507b8ca88a9c51248967ae472bbafbd20b51ccd5f86ae16614e6978ab0cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154064, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f5a04476aafe5c1f6c22f0fc8f3841270ee844aad1aba559bdda925f3b293b80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "34042b697175d4b9ec939dec39c17878c1c1fa68f84c0e53d0ef0dc53446e087"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "3c082714e0ebdf13c15f7a9ef8daa174449c45849bf20a33d817b101e4a25546"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "f52460e2294cd05b86a5ca435fe9e4e2b0bfbc29021bea40f9260c30cb4c5ae3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "4fdac7373e3a19ecede4609502dbc6f4be59c9e2ef6fbbbb6e457a789aae95e8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "a627b2d6801e0f052d47efa82d71482da565de8cfac92d3993bbad582c6c4351"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "68beafcd0607ab2415c6559165deecad4ffb7e2ec75e47a424adb3ab3f54fb56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "25eca5c3ceed2cb73960b6c1e408f03177d5777ec3a45d32982e119021cee5fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "045bb09ca39507ca788b62d17733072575b0679ec4cd579e4355ff09471ec9dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "7f99a6852e32352b1b6969c782cef5d64e800529ec0feb986722c0c3cf0bf46b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "116f2b2ba457a68e0b6ccb89719c86c885a68be1416d7940a2fec09483f5a236"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "60d66ecaae105d3d689e23112ccc3b66ee788b7a935fe43a29111981c129f769"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "47e71041f91554173f1a2ee75869d04d785e8c805880cbaa5f6929bc4796b461"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "6629865ffd2146a01581f6703f221657bed964c7274cac9e5fc5df6efaad1194"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "97f0347d3f6cf0881140686e0507fc2aa747e14bbf4e909f657b9eb7b9219917"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "bc6dc8e6f3091d29d0bac267066bf44df8dcf3e8dee9c21c92a48d02604b450b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "6e2bae73dcbeb55161ea31a7c4850656178c31abeb77a627ed1bf5cad23daf77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "3354684921eccf0faa39fc56caea730a56898e719a2f13598f5ab229d55775df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "b1f8065cbbef0893ab6dc358ea94dffe8d31da3998eb1e5c3973f04ccbc96428"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "16fb71bd3dd4476f046490f83a03b8f21f20f3a466f29c0fdab4b3bc9b6e41b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "dc49b2feb99eefcdcdf45b957883a5d833cc257ee54bb2a2ab6c60761eb9bbdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "909f46d0467a98662fbaa6ab899f7433d1edc602eaf99d0edfeab11206783b9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a35e2e5abeecda300108632b62a01da813e0d5045027024545648f4c6cc5d83b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "a6ae5c717d3999e01419423fd53aae33292e86c7297e39a7cdd109ddd0877bd0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "aab3c8aae2af705b882704b8d3fed1702f6621855701608216f03eb471335437"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "c4d744456e9468ba96c10bb902496b20499387ee4f0e099437abb6e192a3f16e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "7ff4994de8599c95fab891365cfbd648c3b121fa943fa4708109b35cfd977e14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157168, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e45da99f530088e18bdf80867ef49ea7180f9c4330d35006adc8719984808564"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "34a083da990dab5b9645792586226501fe1fa88a06387385b02158094f5ac61a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165280, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f7d82ebe55f0e9c3c8a68ac64a47baa54b5fc3be5a19adc9f79fdc2c1b5eb3c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 165360, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "04e061cf1e1683ea410d17b927435112b7a5de2e2e6a29b4260bb588211d81f4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "f34a019029fae40e140b026b83611f2734163a1998a960a9a39ecc1f07dd5af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157088, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "9c79db3e0e7cd082e8437e15144934622499eaf085d62d57fb5ee630879c4faf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "4faaea0d887dfc3be52d14fbe009925d053e845c6d4dd5469f184267023a35a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0d95a08aa96ef5a5204acbef151a039bbe6f3bdd0ab284b220c033c6ed8175c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "9b8063e29abf3adfa4a7f38f50614ee439eea0b0c3837b430a054e8f17e1294a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "835e74adb78f489f2ac41053a01f2b47f5e23398d4bdf9f441cf60303939f256"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153072, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "cc45347c891d3618db570baabdf7c88dea2f425db1436da723a33621ee243490"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "eeb163af167663f6b2fbbaa8cc963c525b03ae4b293fdb196c76aabf48842e5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "7d43b58fc0c683990a6188c9ac113cd9fc901bcddded9f41a5012edef6814389"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "7647b3ca4eed54abec017ce56d7b55dc0f3c335000cc3d282bbe7f29836247b4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "cd9816a2f704ac16ef863c8ecaaed3bd6520286b0c433d69dfdb91817dd593fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158208, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "373d991d08070c3fbbf6b3a45dab0c5fe75214484061d238b86957c6d4a0f19d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ec08e179fbed89356140f59f7476d82dc06cb3122b716d887683b3fac837ca75"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 153984, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "11a9424410cdeaab9305860c23f0d277d86f94ffec92493d9baf73a3c2ffe051"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "1bbfa765287017979a8d5b879649aee845557bef603801349e992b86123698df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "52e390154cb9374155693a6eec3aada086a07efb68232275c693b42d8098b249"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158288, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a60299aff958c6ce7fcbaf04273443290c65c8e0b28c86dab5c3c53063bbffcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "9f1991aa140ab34312ba7403ea4c8c9bd8ea9376bd57f5db0b79298231daa252"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154064, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "611b9aa50b2e528df9af831d464ce2b977d106636278a04441c429394ac90beb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "d5cb00a1ce6a1da7a2e6204fc7d60a9a5ca7e160881313799223f5eac82672b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2c2a32ca5d6527cc95e1a64fb9ac95af06d6ac230cf05fa5eff8a343846b34aa"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "bc5354b7883370beb668389c815b54bbe0691cf6e300e3f953eb97ba4be426aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "64292f32f9b9f82488afde6d8296bf6ccc4c11bbf910f7343dd4714c5c4a0bf3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "e3a8eed426a1738046b78762535a61adb35e70332f69056bac3d2601efb23e5a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "4ffca8813e64203f0a946c47bc355abbba2e42d83c7223d957b43361e54cc7f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "67c086d95ff7d3929d05011c9615299ae50913b83e15cbfc7389a6501a308ce5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "c95c99cf26ae42b2470e0fcdd66a95afb7814a74618aaa7a1a57aca854e08f5c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "2d5d97402d2780d9a46ba990e299fb49f2568ec0d658eb49f3e40c0dca4d59e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "282ea32f790917a0c92a2455765c1f722fe7173641792e4d1b68eb81c9b91b9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "6d07f788c649aa81475ebe9a84a6f90e075aaef0e9d252bd1b671c6be44abe9b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "233528a64df9233e16b654260c5351f3c3b5144ff433bdb72bdae3eb8d9e0507"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "23ff4530b2d2c4797b3d921b033e053d44fd6421c36959237527ec2e24631998"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "774ab33b6d55d407b84561c25eafcbea8fae7e52f2c03f2cefe41a29953da587"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "7c3479e91c2ee8bcb081ce6fd781bbf45d89e5db0d67dd18ae2dfdfab77e387d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a7bf7658c46f89398bf020e1406115273d69ab8729ef366d7e06c4210e111e93"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8077b49c223d003be763ef48c80a5c27fc019f7b78949f836b9b3c0662c16f0a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9e35131ff77ec7b7dffb0020dc63bbd7ea031e55659b53dfd4623be0e3ce57d6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "88b6902ec38df4d16f1d4c6236ab2bc5b0678f7433a4dfaed9738f765dbb4fb9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "bdb9d0bd7a296abbc6256bd4e4af84f874c28f109209a85393ab69800c765605"}, @@ -5166,135 +6238,163 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b71db8698dff5973b950973e68b8b279e69c887ad22b26d3f72582ee2844c320"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "7d6cdc1712f2413f16559826a60a837c541e82659a36a7d5360b5b589585cd49"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d0145d91b47d843a0eaaaec9204057106f2702bdd69dba1075211fabacc604f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "059d7b9e2aa3732e31e301f216801c40df40efe19090947364bcbce4e0e57d8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ba4e1fcb3ed49f074a41661012c3ca1a6cd1a437ecf3cb396c9708bb81c201aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c9306a00f275c3f29dd571b8bf5bf79030a35a19d02eeb0af87f8935a1130470"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "f17b771b76bad89a4588d2bf6c1cadd72f56acfda5adba6bda1043857baa8c93"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "948ce4eec0ed1406694e48a20a8d54ea24740dca9038a3a116607282ff740582"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "6f9af45af5297b7bc2b6cfa6aa13b48b7c621d01acca07369c7e104829f23eb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "590305b99df59eaeb64e9838d8ddb15d1f7c25e03650019d63f7cca8b62cf8a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "f8bf6f4ef393b418ef24cb7960394a9344ca5ca58267f6d55e58bf7ff974d4bd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "630e7c4775363beff3cc2c2d2bb31c1c90f4d2b74759c9e986f6d76019c6af97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1e7329a0796aa84f234e753035eb6471223f18109979e76559e68686da2bd04c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "4eb9d8fca93f82c9e1aa77ac487dd15180eeb77704fe159654e9ae7ebedf1929"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "e0383a409da4154649c48b472912a59aaf7a91955605868549dfbf616033ce4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "aa11aad08f16ef8d9897906c6955889d2de3e91c7c8099bfa1f1e0a2138c7fd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "43b3792d70f5c36749031e339d03f793d66e2833b874b78fdc6cfca84444c6e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3c50debd610b07a6e632a19a163688a76aaad1a0fd01901890b9c1b73e5adcb7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "41cc435c0c23ae200a1e59d508ecdff69fba059f5e65e09eb7d4c8a2a9989ae9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "c1db7968d45a97bf7d1284463dc44856f06b60a7797f07b6568292483d7ef8c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "4da5775c00e026e1ea9695bea161c3eb9533ce535ccec94de9ec46c3acc5683e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "dd64720a6b9c317b58114e12f07d89325a381e200a65f4b1982ea7c94fd761cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "17d19fce0b25341c42ecae011ad4620281ef16ee9d5ff1ad699f2fb2653d47f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "496c46033715ab17a5e70831c9bc3069bea55a48c16819ddb4e2c5517af183d7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c05a5402e344ddedf942d0b78fec5f1186658394318d78f9b98d85dfe47d651d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "5153f8bf9202bfe07b82744ec5989b29f0710517adaf6b7e4d8852097667df0d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "00bd2359e7cdc11b760959ca5929ef9fc4931a355c46ba0acde3f5c7123a9da7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "c07cfbd40db528506dc2c36fa3aa8dbe6605be0792c7efbc23f5eb5c4de21c62"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "2017b7672a733f86be0ab0092fc5128ada59803f28747665b9d366a495d78656"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "cd940c91ea58fcfa227472c556fbcabd7c30f26f064426bbbc3840663af04d49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "b98eea8e354522f6910ca3e9c3f147b6444cf417e565ef553db8ce58cd178034"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "56ee9407de9e1d140f8ad8452f62dcf3382948dcac879220d1cd437653ec7a9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "bb29f94a0ceabce659cb59e2a66074da48d7c14f633b3e6eef620b037c07f96d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "1d3cfc3bc811a94bbda55b69f7caf5127abf70e13581e401030ee1e8004bffb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "a263bcd3f99a51d4748c86343a97eb542458bb14ea347312cadf689c8d1bc37d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "6be4cb4b2fea78f52522791494c341f7915cc9e1c1c519ea91b93df6bcfe6cfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "92d855fb9391576950dbd2a10bae6c97974e68efb92c63d8f268adfde44a451c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "58b927665ca106630ba46b4f6a33f18dd2476413bfd095107940e5f77711bddd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "a6190c66005b70a550251feb45dbea5f5e146233076876376e45c2a2f0f1c70c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "f1d1575ac49d8ec38554f36f9a47130352e272e533085f94561a9e49c5f7a59c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "d5ba5b18cde7f8409f0f600ce0e01c85cb5b5e4e497259e42848abbddd509042"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "ad0276147d90a6627dc7cb97f6893c1e9b115a7b4367807d3ec6b308f259b255"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "c8a67da9e6f2f750434ebd8e72865b5bb2472cb4957fcf4269fda1bcd567a92a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f9b55ea9dd89943ca3003b3f323c472624beb65493bae6b450af1806e7af401c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bdf54a02153bf06f551671094d63877bdf43731435e4ea6a02e60bacb62fc62b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "a46d7f4c7433eb1244596234e36babf40ff24b86db3a4d46936f01d594cddee7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "be54d95e8b0a6f04589c05d523d414171aade0e85330d5f2b87bd50346476a5a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "82efd043e1a2501c19ecb10d62f8867e3e8f0275afa21e9d808955f101d4b0ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ee7bbad3853395aef1f11a4a97c073e84e9819af689d5d285a62895ee5dc26ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "18012bf245007a146dfb12ca80233e4729031b554c0c5516b2b756d1e509fd41"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "dda5a368730cf78c1d9d3ce1d0d2c43dec0e2091d9624712a8e89bec908e9a6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e94d4a6ab99b8b05b840aff0b269798a8f9ba4b3c12305800f0259772e2bf004"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "115ca6f48fe9079cfedf2b9a0041dc0bd5763f9157b70a99bc7ba83ffdd84a6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "fcc53bdb14428d102e1290b52b7e07cff850a8c3d52b21c31e3ba4acc4d381e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "07204bbf7f62c4e477eef911c765126fa782dddbce56a9ba5ac0590e412f3eab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "c3b7034ea6300f0d910d50e1cffc54d5e137df1216ff227230741ddad0cd97ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "eb12c97a7e0dca1182f3497ebf0d3bad990dd1dfc54c5c24ecbf206c0e09bed5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9db5015e9a97a28a4553349ec220b5593e0018d1b018e2cdc87f7797b2bbc0de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3ce9dc4c5ab1156c5558be8391f8258be89b68bbf4a7636e31c29be090695ca2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "ffba7ba81667a7c72684f101acdf8f1f73b5cab8e013ba1ca598c98bc2240a23"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "6f9662bff6c3c89b7d812e6999a2722cf9d75b9cab242ead8331b57f7d8b18a9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200888, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "de887712625bfadfdc0d8dab7df9d18d398740bdd2a51b8e74c02daa90db5cce"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "6f10ed07707136429ab7630892180cecff026972e3d53bfce07992b017bef956"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "fc69679d6263c27b364f1827e32232270df1b3921abf98334ad8083b0a65088b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "91f5b418eb4a2ca3708c79c46416d1a6c9af735231b5dbfdb7b56e835e212f0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "70eb898bbf9c4c8d0f3deb301dbefe0bc77f3f1989bd5c96e8ac179607c3d41c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "c2b4900a073ad03fdef10e455bf0124162447e054de3758a97cfe6520474a0bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9629caab45ece43378eb784762779043f129d5250416a1195d27eb0ccd1f702d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "ba03e7a89695f93025a4fdcaa2e3e587868a43d34b73beafcc95269dd7ff4c3b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "92f3f823815b343552ebe2d14e1f9232d7f0f1910fb0b0530be2b506455fcc4f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "0efd49e3e1349d20e6b870de03c71b83e12f071ae115340a1e93e5881693414e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "fe7ec5b2f993d9a9b98411558af0305c0545c581edd4a1fedc4cbe0770983689"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "86d5179182e28bbde0902f949aedc39a6239b8e53d03d5d99eade090e0d1fce3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "12e47bfd2b3cd5146140a0c642945f693aca84829230b138b3f5a7ab02623989"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "dbf62d96e2c8d916be7d35ba41f8e1dadba50a41beb7b0878bfae469eb04b47c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e76a7ab2506a796ad9a5fdd75a8a1e6fa9e57fbcbc2a198b5f21af42e58d8c2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "be3da2d2401d03fda21d0a6029295497ca960ac2bba160c7b9ea57feea2309ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0c4ac6ceb4c1ca04b3daa27d972fe064d2523f41d55c109f0b5322b846b208b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "0c082f754221a7123bb273398e210ab96262f4f9f163e30b2ea26c23291a1521"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "2d5f3992a95a30e4e3a6aabc7fcfd549e2bf84acc739669a598d3a70b05e4f01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "2c13c1a210436faac413cef1cb550d0b9e4dbf34b8fee5de3ea2bda7f732e0cd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "e373dcb993812177f75ba8fe331dd5ff01586b3bfd4e40ba9176601404c297eb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "76cc4be37b03d585fc4d461b439751fb28dbfe3e36c6486605338dce9f8d7050"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "419b5e80580d8d3e38a140bd56743cbd4943896e2d90b624c9614acb41f21742"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162144, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d0409a216ebf0a9f8f60f3c017360f48e614f97a2ecffd7eae2f939cb58afbf3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "2e92359ac595c3611900c36243600e26ba255395e75cc7b43b8037d129bb3413"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "57a4b1fba5b1007fe233d4f850ecc9fe531ea09961e392e1a422f17e73332b80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "c6cccaa4390c435c141cf314a39f29460ed8171433f58fcfafc1f328195d6ecc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167088, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "dc3c084dbcc20734c699227f73acab674975ec20c2b7096feb9cf6d38a549e16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "3068d5bf93b577ab3a49373a0b4ed32ecf30a7523d4f9af0cbfb929f29e39ffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "2b04eca0e4331a3d4cd7a6b164133c8bea9713d9c9101d95c65bb8972093fbd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e2e0e0032bfd10abf70abd74939b480845d6eb932371f3d602c5ada781a1bd52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "ff401635139627518890c5d1d5c9de470e6bdc33fae617086d2cb88df3b99ccf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "16fe7dc6da9c70abcd8a9da436531cc219c8166b030dd28f156538ff6beaa2a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155488, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "4762cc92958cd90515d2a4ae75eb7d6cebe8abadf83bcab89a3ec1b454ec1680"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "24292e89017de8eeb95d8ce5885fb455302ccfd65fc8748aa865b69838c840be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "3f6ead951375065c2265f29174eb16c9070a04bf954e7d6c9e5dbaba3f7a0960"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d183898aa95590dbd7b6b66ee6105c22d5c7558ceeb20207b2479839d69fe97f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "40a0739535cf371672dedd9d13bd3115e8d2c372eecc60bf52fda9b6541a3d34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "037d660450caf78f9853231b6fc7a6b74066266e46b2c8165299bf9d9168e21c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "608cff489b3212666d440ffcb135d056a6b843c004624754e717e8a1a04eb27b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "e7524be6bd3d3c5fc04925470654f81396aa05f7e58d824379e1d0ca698bc60a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "8c27d796694245297910d00d2a7f3653fffebddf8b410c18641a374a6bec9fb0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "c080658a5b56e3e60d604050b5d0cc2ddcf613b2e1ed4fe849e583bc98dba300"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163264, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "4c5d20f8388a229561e4edf9011b802fc47101e6f1ad0913248fbd272e7fc2a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "b844f73ff7950f0b19af176be5574c2cc3f7c650647d64e7baf15bd042c12121"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156480, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "ef9a909636bcf0471188d61d4c66f2da06954d5fc35e72def771b53072d5723d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "f8fc4ef6ba91be434cf2bfad6f2a6231e206c8d990b6e3fee28a656cf3130af5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "8c350f0fe15ac87f4403713872ffdd76d526f9e5e0efbd100efda4292bf04d68"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "8850d9f0007052c1a06542d66e4e52e373634dfe4fdcb6a40ea2713e8e03d1de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "201de0be61c324bfb51f0e0517601cc7c81b7944da0d1781f3905108fdd98fe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "08f65ba0c4be17aad31b579e9ba6aa27344a74ff209a4c78a20ed94c16b18f82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d5e741ffd67ee44bdd5d41f68ad6144fc1ead128ecbc9961a386d7f212e3dce3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "e74b2cfb07ab028854f23799c70f5e1b57d9f1e58ce905f90cc6edcc6a363ca7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "861bfb2694e2d2f8d49e104e63f33e22d324d8c212f5d5989cb1e719516679d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c9667dc9ef53300f89f62e53188239218dba8b9b4df1321b7e1d3a9eb28f372f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "80d3f39aedf8bd391724e2ce468fb90415b7e831f4fc38230c84d66305d9936e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "dbdb2c12e64e8ec0ab1d20baee887aef22ee652211704d1f6be8c88d8d7ce3ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "622b7d9052ca7ec2229b53f68a92c912eaa20ec2d2286ed6c3a0504f78c53b6b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "fc34dd453a19bb8078a54684342737d9c93538676a7ccd60b390631a4a4f80c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "61192d7aa7669cc8e89607620e452cb9606717e57a58e2f82bdecae9d9d1244c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "70c7d1062c93f692662c76780fc6ff67eb6dbd81d90c246528ab9db9e5780b43"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "fe7b8a3e82e0cad42973bb820c29c1925e9eb94d360d0075f293bcf10d57a1bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "00179750151332db6e592074ef1cd287657d53ce699804c7b4be755949b53809"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "2c7757fd4923e46c74bb3a9f7a8c5b964113e192ebd2928d5428c0016404d370"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "5923bc91df8bb9f7fe2eb1a34b1c311cf769c8f74987b9b1252639b6b2f75010"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "a98556e2bf7b2fe95a4503f13303f0192fcea6404348af2b84dd52baea7c35b3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "cc360bd4936a1b68c1dc9de4af82999393856dd5269f305ded1d6d89329cb8e9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "5cbe9c01564ba1dd08b4d089dc9cfa53cf6649d88cf7eaa342a967f09ff0603b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "c2c1a690df0f18a1bc8231880a65ffc8b2fa3b23d1add3c72980d8c38d74d011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b6db979a7f144d6677ad0ea302b68a17d0c97ca416dd8dcada3260d4f0d7e829"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d90cddc6f11ca7780c88138db7dcc938401da05d7ed5e328c9808cd16a1f19f6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "a5cbf89da7eb09e33be868bda1c0ff9da654fc3238e0df77ad261441442608fc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "62bd9022ab02839f66b0e9932a849f89ffee01e1a02d525b45a2f6429cd6a59b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "a72f7c4f93e2cfb6b3ba07af851e5c2faf082ddb1ee24051aa1673c6d1f86a25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214272, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "cfb45d88447698abad3d1f59b1d762208c6a69243f660ad79140b3c6cdbd1c88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "89b8895b9a5ffd18cd980d990dfcef8d80b9635889c580c55421a049b1f32868"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "0ea78e7622c14d9b92473d02605c85495b4b1db427b26e046715788d764bb1d7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "60de4caf1849848c97945f79d8a87c8ec72a1c94f8b500824b4e7b3e016568f7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "a4a7a399e10ae4bc41ecd9c181c3c9cedb1e40c06e4e6fd2ad21a560067bf3e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8993ef2c5be6dfa0a3a296a42c070841f144421e2008a9ee4a63dc93277ac23c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "a81f0572f73d482f52aa7cdd0ed2e5f419661e66100b9cf495fc1fcec449aecf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183568, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "48d7176a288b753ccba64bead84585f01c4e019af0a1ec3a4fbe153aa36cfb93"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "de7176f27dc192ad98a0423305d263fe8c39074fc3f99827b0ae9434cd99d45f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "fb40c4b63d957ca47af3fcb46bfba05d6c19fbcad8b273d5cab494613e4b1a9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ab9f7677ae0210770c2421577454f78f2797a8aaabc999f6ebfa6362fb55e821"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "6c5b0ee60741c19e099f6a701e0e4d5449591acb93144127f0e975729ce7f1ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181504, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "8f9decc99fdc0d390f22b8162f25183b33425e1bf359f81b6ba2a606acd2b4b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "a827c4d1636c57cdff15ee47a00186b9643a77f0595b54e67a2facc37b657da6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "ce6807c0c12cbf4a466cfecba64626b1e4df58c546e8d8a754e8f33d3141af1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "058c9914bb0e908418daa6c22f9c538b856ce1227505a692b569e1447d4ed3e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "5163eb71c17ef82a6ce283332496e62927076402ecd7b2cdd982a4d7771bb225"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "543ae1dbec7d7d6960c78c87869fcdff64dbef972e495442044b44412feef11d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "dec9ce35c75a31fc58a77a4fdc688079c53a394282ed35d7bed77ea26809f083"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "0ff1278582016d9a873b0a99a4147beaeee24fce0d3c0915c97de7ba75a684ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "d6ee5f521e91ac981411f0a8c82b39ec1ce6deae765d38cfff53a34233227f0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "688e2facdeff2a0731bc0a901b632967b6700c8743d7af5ec309c303176a4cfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "21e5e8da77588ea61ed849a60688da5e47e56992f1ef0f1b8b2e4e0689059dd4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b89c2ccae9fbb7fb691a8047ff39198dd8624237a404e7f746c07be15c02a843"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b3506f0629ab6d91f901ae7fcebf2e4457ddb2121ca238e35720e48dff15eede"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "d578cd932b4f5c918e9438136326f713f7eb505ed6934e10079fc38a0994f400"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "c59e692ff0c37d82bffe97201dfbbe53cdd9286ae57713aae55a879a2e8cad8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "245fd8b0a2fdc9c4fb86b220658c91329762a2bf5d1fcd38284793b58494182e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "bd7d5c408c91f7489155ceae300173a44de67a6d5f97b2b7db0e3ae4ab8f4dfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e1eddcc209d1fd84c36fa5942098c7d2a4aae7cba08f1e22ead368993d223850"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b7d87bc66cf9c2c59b2e1c992461f0b6d681e414310e5759dd156c08308603b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "f5d4df98270f585401c660c7934595387d88c5ded829ac4b710ae6a731021af6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ca465d4ffad7775ec1ba7621b2d118b7213ce0209649ec3db21c4f4593df3303"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 214208, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "4f5511c5a8d73c85d56ef689df3ce482fde9177ba90d8f05b9ef7138118609f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "151ddbe1bfc76fe40700cec84bf9cab68410a0e44b422013719e4b8be0c90670"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a4b066bfc48858751867560f22716a99b99bc27e6b91fa9962fa5078bd9da33a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 208568, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "8f1cf0397460021de02cca531a1942d4dd67cf2d8c4f62359aeff7b5436518e5"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 181440, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "7d20d146c0ee6e9e115607cdd755956ee33d61d83e80b2e0f4c8c50cdf87f787"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "b803ed713c936aed37d5f28968878f4cb264120639d2002fe3e3039e4fac13a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 196376, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "89ebdd18aae7bb41fed86b564c4b23fb6229cef8cf953fc8b3e92f46131a5807"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 189592, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "e0a8c0e8ff7990440b94f65e9035d983d90ead055a23e73c026e6b1895b10fec"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "eeacbdd0587a935696f3879ee5769d3cb4818a7f463c2e9750d42d8a2f3d1d45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8f20e6f113fecaaec55fd5db0adc32a5972720de2775a8f56dbba91997c567a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "455d2421945986782c47c1975db1540ef99393111f0e08a727278ae94b315f5b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "7e0e5b0e512f45aded9c73f13d006b0c309a966907a7fc4580da838d92f3764f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 181424, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "103d23e045712847b10b027a16aff50d4a80b5ddf0ab4a9f59a0604bf6260e33"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "29b1a265a847b8cd62677e11c882e8b7a746fc5472345ca3f3642b470c2cbe69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d61efd7d74c251c39528cb2e5c07f37b370a56764bf65365e5152495613c4ab7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214272, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "291b21aa22337a1953b936c89a9472e107e68e7a1117d70bcd64677b2d483282"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "13ce1cf79dfd7ef4254d0c24a382fdad4806bca5ab8f0879ed26c62d1cb4b670"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e7a0c8492eda3d4919ee4adde8ce26305608f37b4bd2ebeb72f0fe7f38b317a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e22e9443b2fbab423d868036a98ede15c92a3e7731f492fcce25ddf3a571122a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 214352, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "6f7ac0f1fc09a214236dfb9ac5db888fa1a7e2b5bfdf694a91f0c588f652af34"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "3676cf5f52775eb96210628bc6d34cbb1958933ed0276633dea100239e7ce35d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "677cc2865a2da0df60abe8b2dcfed9f5e35656f155b30d609f44a1e5d78c43df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "efdd7d39b89f8fc0d651a31808268abede419663377f028837d258eea1111250"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "56b307339598b5a66ac31df0959523e9320a38f312b2d1a044c5ffa289e941ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183568, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ef33a27d53f40274b38ec9955905caa605695b28650900049a378ddb9349c809"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166240, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "8cefcfe7aaa5db366bc4f4589a5922a9255ecb71ed615e586f62ef36ec429102"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "64d716c3c36021a266fd9f7b41f910cc9ac07b8ed76b9fe5629deb41cc9bc856"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 183648, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "820b3c24d8ecdd074f31a38f6994ae083db623b0ec47bcf43bc0539c7cc50608"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "305ed8c9328be3e3ff6a09cf648abdeec910958d9368c70a9ed9b26eae54b142"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181504, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "c5019edaaaa962236ae2cf40226927e661ca69dbff62a4a85d67ae3c015c334c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 181584, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "75b69a1bf5354ca7b66c88e8f0d044f2bcfb39f5c8cdf73c602e0ad54b61f6a9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 181408, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "bee3ce035751b8da0d6d26d9ec3dc6203981de12cbf1d04fb047338311d82f64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "354005bfc478eae1cce3fab9e6c708b0586f030d9fc687f3938fab4a31c027a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "8e4937a0f341a111590b362d92410b581a717a1751eb0fbc92f563e56f2232b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "662f37f651276673b9dc8145b5a049b5a0bb7efe8c2e8eac948bd2c176b2f768"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157536, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "76a938422b978b1340447f362753214169cc68bf6e666516dc9a3b56395306da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "d39a6e7467c4821baf7dbab3cc584301c1cd232816b497257143f3fefa3e4d9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "ca57ef16322355f777d39f6729512b950533611b488577251dcb2707edbf2f8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "adbabaf2c525a3efcbb18206b2309e14937000bf0d65de0c88f2745770656e34"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 167360, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "3fffd7015dbb1024c30695ebdab4280f0ec0ad914867615d97b38f903601324e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 163088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "2d528e6aa16546dee87a90a3652909473eeaf74645b714395999f4d3c6f7b82e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 158528, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "32a620154a6b25b692fdb9811125981c3dd420393d47528759bf661d074a5d1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 156304, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "0f39a5ae31c3a2cebb9242f4a277544a8c5bd073abf8be320a6a5077362d5d97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b163321cae495b976934fbdb9149583de6f6f52ef7aa78fc2ac863e8b8b7d843"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "c3090c70db329e27039c800d8835824fc707c47ce7925e08a61e3fab905d1ad9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a20b792e9f685a05031bb1c6a355d2f7a55482c93a4677db7761b701d42c7540"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "4f471870d3a63950e91e271ae3ce842b7a79458513139f9ab376a9bee98e705b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "04794a79e9ebb94c6a2f296796732eca5f34799543298ef263f075063fd2c1c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "1f1f3b8058506612b6b7cc77f174fcc9cc0dd6783af2fbec85a4a967a6dd6833"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "6463472f140c30eaca463e7e49046c97d23d0c4dd56fd44427ad3ab94aac2d64"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "286de0c9f3a5f13dcbb1ace76e6f9bf61afd4aa70a892f48ba2f81351b32f997"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3c1d58c0e58bf809b865da0d6c917510935f6ed69d8481aec39d995c0d5e256c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "048e71a97d23ecc3f75d0ad579864ac20d2a2a900bd9e05d33f7c14c08d92e99"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "c9d4bc3fef194e019b00093b59126b9133fcb498f1acfc97862622cc7ecb3dad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "b97491f5dbf38a3df278578610784212e88f8bcdeafe96058b2ae5c1d91cc6b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6f8273f24b6bc17db6f34f50f345085130fc9dc0afcddb27dd6457fc411d0bac"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "cbacabdb90ac663532d91e343719c33eb9001678884d42275fe84463f7123a50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "9d5c5c1bac0f9fd09d45331da53a1f30dc3aa66c34941f518a297e32e2429013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e540e1923eb984a7b8c210417d76b0e01b04488dd77958b9f6b292d42bb7ef24"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "99efb3294135acb237a1208e8ad421f165e9fdb0e2e9137f6af7a373baa49b58"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "2cf1877d8a12b00903824ed171b3bb0a08f0fc0c27893a4244a7c8578721bbfd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "7d46bfdc5aee775ac69d230eac8ce8bb24929f373f7edf2ccc6d9685fac5923d"}, @@ -5306,83 +6406,83 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "a68b3d4fcd789664b9766f3676f4bde4015eed4982211f9c701459d0b7d1bf7a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "1bb9cfff2b3b0928ee442bbab796d56270a8bff23d48f8aef032f8f8b91c395b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "2195b35fe2c22cc2c5d55c05650c748c94934bc975d472a43c202fe3b04e22a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "a3d1d721ca070064376e1a4afbe422939674231c76c99282df2579c91a774dbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "4134109fd7be874f2e9bf6bcb5520308d48159c062a1a28f7ead8ef722734443"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "90446bbee24dd698e0a2e16b7831da9047386ec8b8ad8188a05e8f30bf2dd463"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "7e56b72b6834217cd2f684298562a8e0d84625641af4dfea7f0a7a5af6a66d31"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "6782f7b86b6cec446e7fcd29b72a19bff2ae345b5054ce12b0fd3d739528ffee"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "55884824ee6a46565cfaa7175240ec49a745736aad1e56abdca1d9d731b51e72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "22ce43fea67acd6e3a8ce5ee5441eebcbfcedeeacdcdb00c7498b52a488bcb19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d9455f88fdcb8c1bb3a188fa3e334cf7189930c3e851521940da4f234f9ef6c2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "cc0762778425a0c0ba4d6ee7cf4e49a7be983a23378a55748ba2978e511149f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167328, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "f99d95511bd85a4d782aa2bff58847814fbe958489c26e2e2929f76c0ec87468"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "6f1dc59f2873a8432d2efd71056a31931965224dd349666d71550b0fbf5fdbab"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "4a62bb22583822929da7b4dfc433d46621445fc6e3a341e8454c1e3b85109036"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157088, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "c6f1545ffd2adca8344a4b2acc6a6b41dd71e8a67f546835676087d9c4b4d362"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "75cc8cd396956cfbd7b6f31bf97041ea7cb238c79e2adafa51800a9967f5eadd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "acbdfafc944416349503c46a2151c639aab5cf21d9db135d4319bcd710310c1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "8e1481dbd980e3a67a9eff13dbdd360a8793411da6820257da9d21f61080d2f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "98b60566cf0b35937604d7dede8e7dd40f5051c60e22e1df35f98f58e9431d09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "86ff6dd7d3cb4881bbaee020129f3ebd83fcfd65665998fa0dca85e893741f53"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "beb11e86887cec5765df1241136a24ee6760e157934732ec8ba65df1255a6ae8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "5e38999ed4eaca68715e0c89e49c62bd6ede3da65c01015f5cf8c510dc9bd434"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d7abc53f8341f86e94cf1ed96777b546bc1022cd5bdc7a441d50042f8410a9d4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "f13bef05c6eb3942c7cb948b61b703c812bef617ad5d36d5b1df95112d3cfdda"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "d37091ded08df976374ea9db30c42a70513a41f3e32884bc9a7d556a55f91d6c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157024, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "7a8d3920ccbd043e577ef39b8a7a4d4a02b784f38cf0f4053660e6eeebecaaae"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "4c50b4fa54526d94f07984ad84f01dd92b4b6d637c8474a68aba4477da16a64f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "009d829ff5daafa9b944dcc62b4cee43b93228eef93d53186c45affe1c050491"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "a614f6698f709d5453e49a4d539b5e112ba221213dad44179ed98038485e524e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "b7a4c16443450d33ecfcd22a9fa61416f4833b2f6d31c36aeca351614362f675"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "297681133f327150bafe85389389e243178ebc5cf25b76aa709f53cbe31e2c8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "0e49da722f98c6d93a3e7554b8fe60d0bf1819b7c78a92be1d5e3cab10b0b4c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "038250362aaea691ad67f8393c070cce0bf91f4533fa934fb7e6b4090c2e97de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "90f04346c5fbded6dfb0dce5313186e2044a732ff91530dd6f1287f10b231a51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "5c3a9bb1128564754c1bf1a5a563f89d212769393de00f18bd8582b8f34068f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "b9878193c454584869dee53532c63285c3ecbcc23153b5dc3c50a849948258cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "c499e9006ffde965ee90d3dd8e4abcdd31d0c652b403b25a3d39b7dc579e44c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "261d4c4678be82c395de07c119a521fd32fb5d9c87a311caa2cb81ef3a4d4d91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "191e8b93e214107fdb3a1df3846136e8a1cba0504ac1d5d0ff2bf48ec8598861"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165376, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "8fceafefb1dccf42478633fcc4fbb764b7b33a065286dcac6a1f8b1519b68f5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "c3778735763f786c030f7bd2c1078bda3c7e6a118171db3fe7449514c409d87d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "8aa04b03c8e204bd75967191243be80cb55f093f5317787712b922669569ea98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "03e6b8e68e0a470556e7a2493df394a3da8317a0d1f70aa1ef30d8dffd7af575"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "1584d045f918c30644ea8ac69d4c3d692998d8d26eb959f58f56946046595e02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "4b50ed51f680f7875e637c53ff052b717c3015a5b43d2c661fd062cb9b54e8c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "269ecb7ec22167da983fff26f9276f1b678a5ebbfe13c0aebfa177a8736a32f7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e3b8fb37b79f9b09e9a4d5809a5fd0cd99cea62eb847ac2908ae2879a7ecae71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "2c059ae6e1e0ae2e18d302b3306423148ee713a6ea2d89ba30573c1b446038bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c07d5df1e4c0366ae07ce2f42dc73b2caf7d7eba29c43f68eed5b6f604c1bd28"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "4e260a4865812d0207177d5dc454b14f82358deb127f42cdb8de04fbd96f36ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "e09058f3274ef5c0feb81f45d8e3d518e365e1623cf3ee49f3193c6fb7593e0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "b1dde0f9d5a084a2ce461f3bbe38267874616074b8cc050483a73fe0113d69df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "520e959cdb481857bf1a727d122498744eefd08f4086d0d42cff7325e3fdbe91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "f536b8a016ed390c24941ff5ebe54b242d16a78fef2a2bc971f3255724856ceb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "22799bc43040d986a0b6bd3732ffb4ade9e0218932abb82bed3b421e837afa68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "302abe034d8a2760194596c41820191cbf49dc3de6cefc3112a0b1de50fabcf8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "5856ab5f6a87975d30f05ac1c70ebc4f6d9d686f1edd14fd3d0a2d1395e7a775"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "3eb5f55bf348e179353628c5e79e4f636cba7f42de20ee960dfbd881a0f7a4f5"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "8464ea22e0fe2feb7946f08178e42325d89254d948a62e36a8cd7dc2ff5df9b7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "eb4208e206fb3cb476d2b9b1b8413d461a1bb28a1e47f5ec32bec10ebb02933a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 197960, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a78a1294a60c48fab26e7668f2d253a843d661ade870b4d597b0d84d85cfbb45"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 152912, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "4408b9bdef6b2d2e91d11bae130ef873ebab1668b0376f1ef2817a132b4f16c0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "8013a6100d4c1d8ef2173fa7d0e8ee8a0de529f56b718295ff26c93885ca6e4b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "4b773c4db66812005aa8d49de847b3dd6ee28561004db6e897e5946821fdd6d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "90ecfd1553d5f7fd54b5f5c400e7baf428a25045c447a06015843eb2e9159968"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "4548304083103f858367df5a4ac458c466ef665867a468265ab237e5ac77c3ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "27b9a9ae46315301c009b3a59e1dafc5740fc204a2daee6c55388606e655be88"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "c1d509ef1a60219f66c03372996be1d2435149291703c0d662ed2795994274cb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "4ac0900793e73a7ebe4d3b02c0e84acca94612a190052061b4f2a7faf4a35ba0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "f5906f1bde7dffccdd5a99ce81081e7d1853a7ba4519d58c60b1a1be3f81c014"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 152896, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "0e9360384a86256315abb1fd98ba1278b47baad3f9a0d9aab75b603f28257efd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "ab76a7e7ea3175330ea386e447282bd0a0223d8e4bda2e9378b87a920d5ec3a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f6d79ceab42a09cc0c306589362d329e4e264d5df997eb8f981a3208f20dc494"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "93c90e311c1476dc10ef3a69b0bb04e0d559a21587090527d07dc13e50d66192"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "7c08eb08c50900987639220421d9c22030c67c0e9f3fa2a0f94604faf1df3db7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7e86b4c399aec40fb4a3dc9621bec025cc945e136521473296096e270ede3199"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "58bfbdc2d4867dbfd50c529cf7a53415b5f0224b377cdf068f46fd4d0a5394b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "931db573f51be56dd5c902286db0af4c092b94dc23ac97cc74c018e0dcf10af7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "602bd4343a98c1d32e4851dccd460e069e20ca55c8b0819410c881a3f739f9d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165360, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "0549850915da2da6cf87e442349ea9f1487d11e405267feb97b4024d3bf82833"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "36c0af15e3dce7e9e8a1f3163b99d8dad09ca5e564066fad737c71a0ffa6314d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "6dfeaa65e89a134c18d6e84dc3cdb197a9f2be3aa0ef72dfaa4699ab8bd2d390"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "29da643622272584c110058c211d9679a36ad460cee11da88c0f460907081857"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "7532c6eeac2353a637c2445eacf4435a68bd3a73ebd2a0ade54dc081d13ab7d2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "8065fd958e47825fd67b3974652fe89ef1e247b066c5832ddf092eb3eb49cf6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167328, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e7465b79fba9c5f8d981f5121098f91d2f9bc7cdd7935cd3b2bf1b7e91b6792e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 167408, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ff1678e355c5ff65557d57634baf069f810c8252dfd8b8c2424757e60b7b0ebe"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 163136, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e9e947637b5fe3f4443d1c4486d5444e2007e5a130891823556b474d7b3a0b30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157088, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "50795243858e25deb1884622f3e56c4dd92ced523791afa939a111ca6205566e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "40a0999841eef02b9ebe05c2d9283e306c7f7650b5ddb8b1f1609b8f62ea9bee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ce4c86e174ad796a313d8e1eac578e0cd7b390bdbd3eb93ecc10ff54b386f7db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 157168, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "c7025a70afdddfac285c74e5d196d4ae192d2cb2ac651ee2b887a7737e528bb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 152880, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "7d563d25449c448221f75590ec9df20b3e316d7af489b43a9c830efce802e077"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153584, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "2d1d346da92abc15823a31bd29c8e52568bb8430c649c969c7d82ca906a7d245"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "fc9904f6df6cc7be92f8257578651ed4d6bc04f64bfb07c2643e8c84ff5d31c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "12071ea80cfa1d9ecdee9c37d9dd52577932d76836fee860c223e7bb27a7bacb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d4117320642ec52d7243bef3fb5ce22fd599ec7e461d5f45dbf7c062c2d6807e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c4f0425ddf3f30872fd5ad206de89a9f01b29a2e0081c4638fc93bea074a3981"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a24bb929838902e039bb1235f519e84e75bf3cfa6db099656dd0336f8a194c07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "50f10944c5d42c49a160d951e6c1a89972d4d0247e0367f1dfed7b060e2d1f70"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "9c7747fda9922e79d34ebe2d41c50280a3e0dbd1eec1c15ec37c0b930634a37a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "221d1f5ec92ffcecc5dc943dd77f2e5fcd6159d3f3a9135ed34885ba2206790c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "bb39bc98d0c37376603d87a7db9f511687509e3d571e623432b8ba2c46a181ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159312, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "b8c5f07d11ba87fd47f228261ac7fdbc11861fbd5f20ffc5bbbca5aa07922f91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "505addd48075a23ed626f6603f8b1fc4438a4217e63d3987be2ee52e58753116"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154576, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a9576b2d7f0da449f901a9924c130c26ea03308ef750dc58578d27bddaae997d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "9536b3f98ca03a02f9645e3e76d7a9204b02150298a93ecde3f1cc4bb660d51e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "c31bde0df951932fbecae2d89d4c551d81cec76596062d008f77d202f90a90a6"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "fa4d3bfb6a540d309df0a6a6dd32ca2cb92a6282936e26033fdf358abd116da8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a78678fceca5636ea6e65859352be7172a4a960243410ab8f5b76766b5e15a5a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "9df99c54b2efab5bebd1fd1d0e3999ed477a71020505b4f787dea98cbe889d4f"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "1a82296ab1740ebeec8fd8954b183074dddd89c4583789420f4caa6dbbf17b85"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "0abba1fa0a5c08c840a5c41b5754f8a8d2ec3c644058af9ff349a1b25e052e25"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "9f4fc9842db75596ce2d9e7e938d66a511a95c87191374fa37ac9a85a94a6de9"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "30b95d13a727b58b3cf53d2eaa987dc878da2bef1d67af151c1a9342de91fa1b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d55699cde1ff93ac463cb67d26bc3b7d9beb3ed2f00f9ead1e8ae5dbaba9f5ae"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c62df59cf4b230e838499ebccdc0de4beeaa403812a69ebe4f605425a04ab515"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "5719d8663d3f99ff4dff212eaf253155d77c9e72c6b3c4eedf5f16ab462dd873"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "dd43d60c0d366abfbf5948f5fbb016601e9dae8d59390ce12eb3ae1fbce9a173"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "23c4b90ea73192a4c2f80fb066fc8180c29f3594a31150aa83226ebced9ddb44"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "a5fd158208d706208ebf892900e4479517dd6f37d6387b964fd89d6e49614694"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e47464d69892007309bb8bb89935b4cf34c31da6bdc67b88c989a89564e4e6dd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "7ea15b9a0b482c92852946762c289d86a49849b12bdbeec004b14141cd342ca0"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "66282a25bc6f0d62feea6ce813dfbdd69a73b9d9028099d0b5bcb022ef232f73"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "2746a1d7fb57d484eef574579e8da70673d174573fdc09cc17362105b81cff91"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "f119c7f6b4419509feff271f3a7d9ecc0a88890d4ec112246e1276d80101e2ed"}, @@ -5394,135 +6494,163 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "1d303817f3a8fdaf4c2791fe57be9a460d1eebcd0560d44f6b92e5d366b3ff7a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "02a56e81eccea68c5594ae0c457a8e0bd4d8d0385b10edf37e114bd4554f933f"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "64892afc26a975ff6cd733139765bde21a4f45b4e66d055e041a8562da04b203"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "77c27013f34f821a3766120ce8a183cb8a32095d4786bf5a0f2841be1eb31617"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "04c62f283a76cff8202fcfe52a7e8a42f026867901033e2612d70c96294597c8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "e393546e0c6e0bc2d58d0b2681165d32f5308a3c09edbe1a995755715f383f31"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "6fcba903dcc71ade037e0d42494df74886d251c8d0285216b12767c4deb76684"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "141d9833a5e82bbd24540b42ae5ccd2f0729b217a8da870c6bbad47a422a785a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "cb426a2b0be3c476428d7577c8aa7e8cc3f89a90301bd74d98b2f878919f6850"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "80d0b4c4dd61627f8e29ff4835da7dd7e9213eb48d56b6e93a5aabc53635de9c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1911d6c071f0ed884ac8e716eae111d52fc5a51cf6e1d10a4f7e04774a984dcc"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "3b751dc24ba1fc939e22d009fbb4a83399e8bac5c166e0b74e93cf55b931b708"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175296, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "b461058cabdebbb542727639861fc19e7537a6e8bdeb620682032dfdf199c291"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "c6a7dcf43828d674e4b4bc625d7487c02f82a950cd733beba5a6481c6d820acb"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "f89c9865d2791e8baa99c4d9c66ead313c2ed425f7cc2636a9f81de3fda997b2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ef8d22a095e6070f3b0217564dce0ed612397418ae892d21710b11eccd7d45ce"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "9f111c358d9b7fb73ba12057c7edaa6f4832668d77e3d1050952fdf61e4ae83b"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "044ec85e8ac9df15492837f4bd85ea6c1a652d64b8c5cd9bd29135ba08041fa2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "8a5eb7f82bb3dca284d11869b765d1122bb56554d6d7dbbe370d349730c5ba81"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "d2fe373aa922aad5e94639356da7f1b4d4870a34d2178904fb4c0541061f3b1e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "57e754a99c4d440ea04f7a68a49ec69e2bdd6c5850a302056e6416f4c1e3c1ad"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8652da3b6edc149a04d0fcb503093e5d94ef07b83d994bd2ad97481e94c69eff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d0fb57776cf118937fa0cc0a59cd915791854cf0a4dc31a5635d153018bf39bc"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "e7b8f70d83a9cf531fcaad97918ccf6347d57d99e248f7c6124d81684062c7d3"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "5817552e3aab120693f6f45673ec5abb5986e3345702fe91c772069a4a6cd2f8"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164992, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "7acad8f29e27b192fa293de3ed3a61abb00e152335312729b2863a54a7e99d3d"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "ef3cf4460aa324f746a148e8a6ed9fc81e14606b3f50783872946cae53e67356"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "7bc53a7f63b942950ef61d3de678bff68a61f3f6c3a4ae728df5b31b87c5cefa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "1f4ff1a3371df326dde3c2c870832fea7c2be272a1a221941006abdfc8008012"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "1975f00cf4c29f7a4218060e9ab7996addb308f7ffb721c34e2d8c62e435c5ad"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "777e93c76e59328e3ae57edc1afa051fc2d1be7854382f22e06a15449db6467e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197840, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "10b81fe04b33dbcad197eb78ee355ba2ea3e729318156e1d682b07d4c93f353f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197920, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "2a543255872aa94ab1791ab14124606d3fb2c94ed056b8aae865b1f91608fb39"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "ace8108556a2a798d5cb1e52f151ee95b985fac51d5437ee69c42fe6396d643b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "070f65f093cad2dec0ddc4c96279d9c7904ac5a2fa3baea6c79fe4fb196fd8ae"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d9d89155a452e1dffa29c0bcaa41157dc06bf1cd0603e921ffe795ea637385be"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9f022308514b430a21a0b908aa45de51716ebbd057cd71ca742327480c21d394"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "58bc4c94dfa4b6b96b822a97f616ec9accd69847e6bc76df223d909d11fd1dd0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "8b78583301ca1160e65ee41eb00933d7bf652f8ff489c9d454778abddcf10de2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "b84c39cb76748929d719921cdf7b01bdf558b0f1296ed2a40372169ffe944f7a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "509f7571e6b71f0bbd130c06e370518cf20e1f6a5e19b4698d018927d206b7c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "bea90b7a2835c92c50f52b75c4c404e7979a4db79a485134d97efb8d8130a5c4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "d9ab528ea0d003b46f704472b307a4252b601b6a60b441a06a3a70e01c50b89c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a27a4fb241db99066842c499df08b6b5b44a0eb268ae76a8ef10202ee514e309"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d9ae465f6e01fa297ca854a6f3b410e3f4fab1c8fbca20fb1db1a89d534c53fb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "af918abf004a5cfdb960372626e8e4b9146942c9e9498caf653f72e693b5cf74"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "91d37de5762afe2b09e9fb351975e7c01261b49df017bc015e581e5a200b85ed"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "12931d0323fd17b7234ad0e148de75dd7de6102ac24341c74bb19b769ed5f0e1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "510cbe61dbd6fb3d20d3f6d01c6ec6e92e3bb810d41cc6560947b13fec94e922"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "c3584591575f6523f4d2be252fbb2bb7f4e555af2be975d6a8de41e97d4152d1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "a3bf62173fa36aab96765fc654429a3581f3a2522c8a636cd27e9a5f6eb7eef0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "e6c28d3a226c115df44a93cf981a2f834d16bc105d3f075bda19b2d10eaa990d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "1d2b2fab2f6e3984623466e8b94accde0aa5dcdf8597fbeecbf2f0da2d7c06e6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "ce25af6230e0d07a50b2bf9c4dab645e4eed528630a2e9d6cf953f3cd8d80240"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "a2f9a769c0086285086fc913282f64654f787f8ad9ed04b24ad5ccef3bb604c6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7fdec946d138ab3cf7be473f871b8dc2e61fa1dfa2aeb8925fbcffeb0f1b6fc4"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "71ee04372c355dbd2cd9838c24f13f38e50beb7d3c9a8ca0ab2e8494eac20780"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "be3533ee5df6fe1093c3e42bc8439e2007a66e2215dfd03cc0385da0bd7c075c"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "62af489362cbc97a3a46d2771850a3ff018440b74e3ae64f701191d380ddb201"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 148592, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f36f06e16ca9cd17126202918113d3e42be019bdd61a80e9ca55755245248433"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "9ffa7e259aaad764024a4a58b2d73a5576ae1ffed4daf5ea40e1091be283f463"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "c36480c6f034d42307b7b81cbd4a73b1662927a8ca1d54d1b58ed436d3e8f012"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "ee465f74db8fb9d3879ddfcba538e51decf2ff04301ec650ebce6c21c2b1badd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "fc0c96b93ff43996216b402f051859ebeb88982ca8e423603d4be76c2dfb892a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "dd2cf544922648385fe2a9d36b6590cc911cef5e312800aee95e415996d6c9d8"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "65521fcb7bccb33eab1bcb6d65db3fb5556d0d84cf2bcf3345805c18914dd25e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "8d77e88250d7518dee5c2949ccaa49f95cc66ed0f2ebc4093ff0d7fb2c55fbc2"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "528f11574a663181c2f6d9ba8b57b76caf94ad917e7d6fa8943b04b20f01e608"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 148576, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "157cb5feb1eb1bb2d4328af396048c4cd504a62a622ad561fb1a8456a95256b6"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "5fb7d86cfa0a13b363c19f2486d3ae2dd679b89fa0c80309252e270dee8b058e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "074cc016120beaeb4196ae04180b7d30da387fbba7e113dbc052bf2daac7b009"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f8be604e4df4f8b38b6b4e9236cabbfc35d62ec5252b0ac05272b648563b87de"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "6bb4d7ded1efae0db6168cf481d8d0484a64957491c2fef102c9784c3da9f96a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "f0bd33ab42dd3cd5085c8572c832026740afc0148993ab1a64e8c323ea4608db"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "06fec04416bc9ed1dbe172091360d74ba515ea1884d46fc1aee3ac5d6a40f1a9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "4391265ed89123f37964d9414d7676f49200e0c69f4b30e108829dafe7e8da9a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "92d06131b0f67e5644a96134e98336646cb1f75bf26a53fe92d3f28c9c9086de"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197904, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "61a72bafdbc2c039f5925b08f49b4d17119fbd888cb8941323e9ded7746f0aed"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c408708063e7db1a6e39afa0ceb44f36a767e882513eec1a5ced3e0bc608af13"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "9f997b0922faa8da472bf7a96d44608b09d067c657cda056be875facd700e56a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "ac7830ec6035a2fa5edf1f0bdf2b7f62fcfd8f18868ec63a7bcf3e323b45f9d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153872, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "e7a766591d8cebeebfd79fc0b7837233881240c6bb54e5612c060676d58c715d"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "267844730434270f0f488fbb5cff86f29206a632589a3c60ac8e0055a100f183"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175296, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "8333b8d411b3ad5c2d8ff76cb753e931914ab06fac665588005f365e15d1c96d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 175376, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3a218bd1820fb2efd81d5c67ac7d087b20ab7a4da7387f94d417a554ad25c47a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "4a5b3faf76e4b52e3ed64bdba78ce877653410006e825d34a4584ddce8412027"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "91587c3605c65a5c42889d64267af56841923c6824bed0170c994e35c7edbedf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "afa3a9e1888f4227d8fa03d1875f840d72b94a1d5768168092c58807f49ce9d0"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 148560, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "8122a291faa135387a8aced6e52131ab3ed208e79a96a052366589e77bd83265"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "beb09742d4745f5e8daa56996082c3ee6fa2041818822cace33c10f72a591ead"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143120, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0c7d6cc726a6fe6f45a082cb5c815b30fcf82a050111ffe1825f7ae6a7de4cce"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "e6f6b8e8cee0d0516d153254af3802af7e25eb83205afc7fec00e3ec25391f72"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "4453d6a8a12cbf2a9bb3a2b8602dfa50b7329db0007456735bdd862719efa86c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d2099f1e3dffaa1faff87cad8fad129dc7ed0d2717726c31c5f0cba1156d3322"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "324638f0209975eba71123553c6682c72ac6f0e6670da849c13888d8262eff30"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "a77a5c3c063f3138ffad1f2b4e193a717f8878751b5fb472b2a9edb4f11e1215"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "9d59eba038be36560281a8dfafe3106cc20b2f391496c37fa1aa86f8e950d2f4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "413d6746126abfbb78478ecdb97bfab460c30744476943492877759faf803559"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "08656c8b9fa9baedb7d70abdc5e2126dc1270a2919e0c95780ef2c417361c2bf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "bc1e006b847b9dc2d6a2917e62998ae9c5ee6f711f4d9c53530dce5e0b50fd04"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150880, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "5ed9eafbeb8186249a07b6f10dd74413f61857e89e2b5c21a358da4704382a39"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "708e1e07b4c1070e7ff3aa5520e3dd041050fdec57ff94262e3741234d06ea2d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 142048, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "22c04dca1e88372d0d5531a694bded106a2436b5a785612f87a705fe758ec6aa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ff7ebbea3fe2363bae5d7b0f005b23bde9f98928b53a4fddf9d9f045facc17c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "10b2d9c425a5fce89854dffd3b2e5bb07371a8c12fb6ea766463495e472658e3"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "d9645bf61090c382dd113e881a3e05e15238744e47c1f1aee79c4504c2dbaa84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "67dd005961d65bdfcba809240b16a969b57dd546796ae35f389980cd3c7b1193"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "c9810ccae9bcc14bda6ada9997ab9bfa227210bfbfec8027883c0ad8455f5035"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "a5e19fc53c110c952c4a01d1090e94b35920394010df4f0209e7f13d2cf7390a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "02cc90ec4fe13cae78318b8329683cd217687ad6b69f064254f25f8e8f014623"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "777bb8ac154f9907be9d5c10758b0af56fceab4b8fa6f21073339b30c6453c5a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "afe855801d998a10f9b8fa6350f68439f200d30e1ad6932b4e6deab0cc1d8858"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "46bfbc6cb123bba0ded5b8df1f3ee931d5fa381bcb882411dc683b1f6a4dbe14"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "7a30278d6f681153e9768b829c5d00059f27b98561b2900f9f41bb4342f83f78"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c4ad78cebe0b455c98679f9f35207f24489e84e358a2bb9124b0f81ca748bde8"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "733d64e078a2a3c7ce48d7188316046b4773633b54a89c57f1ba140fa71a8052"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "1819103d9bce014810358148a35b6b320ed3663e2d8d4a63824401a081c5e7e0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3adbe3931245dea48d7d4242c85abe0ebb42ac7d60fffb9da63b8398838e9f81"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "7bbd34b2698d890fd79be6ff789a2bf603cb24201ce36868543b83c6b26d4980"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d0b6ab6030d6a9638d3087b89c506a33d2320e76f3cd85d378ad31dff3a034e2"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "303584d683cf493934db934d0c01de46273ea65dfd7e5f2770f7b04dc60ab1ac"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "522a642f4c2686b8172825e791843269a98baa2c13b097829c46d6d55d8a3c66"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "d507be22bdbe9d0dbf667e7c3fce804700ec55aca31564506bb83981852d0950"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "ee30498b537bb755f781040918d3f13cf116ab9cf9e5588fb02203823c9e289c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "9213d1bc69dbe8c4836adc6fb413c57dbcbd18ae7e0f8f2f16864d7823a2fcd2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "292108e4d395668d1bc896bde28fdf82d602ffa20ee5b71b417bb3389908bf6c"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "b4a8fd9236144f0ba06dda18d25afa75914977e443055895a1abf2d195d8d6e5"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "6d1fdcbd0b71d41560a98fc52d57007a08b89b3a38e029252654ca5aef9a8380"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "d1b2e0eedd8dc43ef653bee2d0f0894b5cc5104ff214d62e0aa42324fa9ecc15"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9de0eec95bc9240fdef0340efe213d217fc1280e5b71f9a322820801045b4971"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197792, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "0193e8e75dd07e89156b6d61c92eb329248fc216009e5ce4777ba9e7c1eed81f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f1e7fb767cf375a5caa17221bcfef4a4fd53247ffaa6da23d39d2b3749241712"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "ebc9abbbc841543ca731db2b0aa4cf4e94df41275b4b855271bf1ef0329e8ef1"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8e2420732c1edb06a343d75f514b0c85d37d059f65cc80f40bdbb4b54a1d2f5c"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "cea40e6ee2cce1f9986d1527fa604e97f0eb05489a1e5a693cf51dd33430d69a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "def171a4137285bac3ba478eadf594684d96e6598eb357ba0860ca7394aeb278"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "b16dae38ae5ce1b291a2892fabf8b2e8717f21d56d9ad9dbcee14cf4e1ef4273"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191680, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "734649476fe2e1cd1fe511e763844a85d68c64a3614d80a387eb584a09ebe631"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "ecfa25acb0f8ecf520246fd002ee801b85780a8c96443ef1cefdf8f69949dd02"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "f7b77b27c0d907bb24aba6aa6ec785f8c62451f749b3f16e20030a63fec773cb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "76fe9b5a7b08c9f06bea0528bf56aa5c327fa1e9dd6b6685aa405f4315249735"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "574cf4e6e4e25095b97564e1cbd882d6a10b9b6ee0ec12fc776606f5ae560e06"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165024, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "f564e6543e5f052eae6c44691772eb007dc3b38b66a2d7ce2eff3e9852b73c22"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "4dcc19a045f633cd692ac99b3a701def0784c8e4038a0d3e3516695befb0de79"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "aa0ac490a17913a5209fdd65e39191a0cd14043762d386d9ef17c22a28a2cad0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "261d52b065ba3bd7fde4f050237db970f1f9e656a98992d8d0c968484ac5b2a9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3509ba7b5abc02d31edbd0285beeacd6d730585bcc6ec7792f025457700eff2a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "dc21586c50164e956f9dc34f6a269424e926b1813190dee007437ca21de80837"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "c248734d14a350a8f56a40cc0e92276da0b8a9bb21a17c5979fe489f5503e5eb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b69c67e699b4e78ee5d1a37f520452121c778d161f7894954094c1fbe2738595"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c7b5c38fb22a105575b45e05bbdec70f0613b88718a0b0814709cd642c476634"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f4e253f98ef703f16978cea8ae675ad9bc514a87422b00d1b75595f2c39b1cbc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "d709b8a1fd796c12feaaac6001bd4e303b1032c67a559ccfa83001dc2003d30c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "c05a944eb565b0e57b88d48358d6d31c0e8355e3487e4e7272c52d78914061b5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "65a73cf31c05dfed35f7f238e2f67788ae263e704feaf4a2df875a16a0fa1d70"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "bcd6b7300dbef954463d4cc526034b1a8a105b0de3895d5786b23ae15863670e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "c7994228148352a5497f3d68026fcc28dd82d24f23e08677afaf81491d3ff9b7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "46957e181eb4b0ff2a0432d2464ad06cc69629ad16ae0a6ee4c561aa1d0c2770"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "692d51a7e7e7201f5ac86929ca8dcef3b20429807fff11655824b034d90f12d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "4067686fc959e068f43f3ca92289a28c0b7b62e1dc3883d92be62da9cac81274"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "7f0e70b125e2ffb654c56057d20bacc081c89e00798c5bcee636d4a72ce670db"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "9b87a8ae948d47b0bd361b008e5cac2cfdf6e122acbe88c922d684fc7e5d9942"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "796eb33506c5baed1dbd2b53e594fd6f3584377a91ac0785c7042c15572129a1"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 197728, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f7b30ba722194dcf5ea090c9415fa028b9770bf9734f072c1805a66dd5679d22"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "0aebceee86f7907800f7c1794e73b84adc9b1caf1b62fae3cb5cd088c253da67"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 216680, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "40e98b0b12b9e631fd993c498e44eef2256075db25eb96bbb6ee6b28565ee02e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 164960, 384, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "27074882d536187540f5f372d9a2d8ea85360775a7e07b0fa3fef7a10102004d"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "fcd3da7cb7d851277bcb90d43b6883bf9df707b129a645bce1a224d88c53ea5f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 183992, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9afcc9253ddf3ad9f14804da24387d873bd2986ed52b174623b93970aedf75be"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 175160, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "da7a39beae1b6e39d9fe5ee04a12fd78db81d900b5cddbabdd3bbd320cef61c4"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 197712, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "0e85ec8c5321328d4e5c39a65f7729726c8247e9494c44e7b9785b28f0d520c2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "61b5af4e103013749a93d47322d0c7872a8d3364aba60d4d90f3aad31a9ea5c8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "41d9e71183dccf3836e522c96d0c8d9acc3c2efef900cfd8698502fffe270aca"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "9ba4812bc395985bea396ecf3fcbf8bcd8e8ec37fb28ddef8219403afe001b6d"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 164944, 384, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "458ba4ba994b71350d688aaeb31b8c4a1d2a4a520e3f2876522a5f23185aad36"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "5ca1757ce4a8989d0c4738585fe7d684017ca8496546ed1481cb12af7aac17c0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "05cefa26be28d39518273006dea0f5bd0d1425de0a3dae628097c950e6e2d2e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197792, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "d704cd8ee3d0ea2a588bb0245efc6eddf767e4772a03c5fc31a773ac804399ff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "3afd63f7221f561aa8868fd02d639494b2edcff8d349f6b7520759a3de5f0da5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f4527df8637c9dddcaf856c78f08d6b67f4d59f0efd6dba012d2bcdc3df78eb7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "908c186fd7511df6b370e5ca900479fa109d636c71114d203f477c9c73cf9f14"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197872, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "47865c857b712d5989043e594219fc29cf411c39016e2cd7515aca422fdd62b7"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "4660da03de67981542d0f21ac2321b192c7d1077088465bded431d3587f2e47a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 197696, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d050c396ff8a2b781b85d73bbec7a42e9b10826dac5f6d59364a9e7bddb97904"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "604fa452c3094e2af740c28254c1057ebc3e1f02c00573c585049d82a0bd0df4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "3bfeb1e45138c21c7304cff392284e604c43f8b62e4b195155c219e468fdcaf5"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "591b4311571a27b99f4229d16c454e35206e12948c7d0f9d5ef3d5936acaef96"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191680, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "7a8ef0059411f3913c85b08d8ce2e2595a102daee63b6b9676de5a8623909e01"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 191760, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "c893a5611c55b848329ea5469107230c01a9b81ab5087f29350decb9a60f9dcc"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 183392, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "4138a4538f725a6d63a73df4b97dda358eed376077c2308d0a59bc38a9ff2acd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165024, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "7d5d8b249cc3e2962b1c36f20f5abceeb8094d0ef5f7999c0d098858007f91b8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165104, 384, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "949736fb84868e389a24ee1199fbb9fbeab8d4bf34799d027e6e43973f4ae65e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 164928, 384, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "eeef2986bfb85c0c44cb6c0e3b704241521aefdb9ace2800f62f5e20b61b9b66"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "b723583902faa3f3eea4cf089e229375caf1b825d85649b748898923172ec1d2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "a2eaef590e3b8dd00e2148805cbf3ae3cedbb02cbfd63ccf0f0ca0883b34fd67"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f3ecadd889b6927830cad6e90143c5eb2b4977749c337f7197c9af44b482f817"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147216, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "f6e7282897aa8edb3a2f814fec2508a9a8a6bc1a3d8eed83f189439a68374505"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "a543c36a905ecb3aa653314693e2023859c59b0a8fb34e4fa5956f66ff223abf"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d9534a34001117cfcf987449217a28166d0b9be83f3ff9458cb663d5884095e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "4f860bc28b290424c75be9f946199874e0cc2bcef02f59646bf24422058a5640"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 154976, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "6dafad0525daad008f392c1d5e3dbb8ec60b0343b8a764bcaa035c0ef4ecb8a6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 150704, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "bf64a9b27b737a7845f4173da19ff2d1b05c99eb782430a824673471175b8b9d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 144096, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "7c9fe9c3068b903697805561984e73d78d28c1abc1743d8ee9ded3e3bc3bde71"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 141872, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "ef9f867590a2ab3ceaf712c3ab52bad3393c74f0910405efd906a1263c4915cb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0cbd89f8981af0348589c4c2128b2c12136c81769b18f9bf7d481220f8fe90ff"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "e16645414ac4e4e5964bd4e63e5b4a12dc877c539965a271f417731da824e503"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ecf59d81d569a93e231cb562390b63c3883ea5b6e50f35778d8d4154c85eec3b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "705c074bdfa1a8a565837a96cd1ce7c6d0973fceefacb3e9167cc6e5000572ed"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6992106127f9daa1dc2287e12f2333310d49e187ca562b878a61702380e3ef24"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "44456e30d4460a8d6ce82413ed66801ccb26406fbc9748bbdad5bc362ca189f0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "21b83fdcb53017552c1b0de15cd6b28c56d622dfcd08a5dbef52b9bda141ffd4"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "05f18b5af25c6a520ae16b23615d463bc2150fe7caae604c87da4fe7468b99ed"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "36188d864725f5f0a56c368a5de7f1cb5adc52f1c766b9f5f65c62f29ce15f74"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "060473049f6c794b0b3cd5c2506f176e066e6be64f5c18087ad8983220ea5f58"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "70ad92709632a45cc5af12514496b378d5623cb052771c301c508464d0966273"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "86756ed545af92a69505851a9e8e1f1643de86e438586f20c39016633cde3e4a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "50faa967aa4e77484dd7acd65b311be6380bbb011da858c5dd335dc3629ed0f2"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "93bba920e6b432daf9597f97afc38d5c0fa625416f7c31a8a2caa3b27e324c57"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "03f33be0b3eadc24433193b3ff432826490e024f9a17cdb17e949240c1f49757"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "19d97cc314f44481e1cc8453dd12f2320c873afd281e8971f410256f17475de0"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "b6c9cb1ee577c45f99b97b5e658e5901774ce9978d356a4fc6436db0bbd1e422"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 1, 3, 0, 3, true, true, false, false, false, false, "277901588c9df647ac9a8c876a9b82d61904396fd7c2d3801b49b91fbc59ab93"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 1, 2, 0, 3, true, true, false, false, false, false, "b8d79d9f3b9ecbdd20dcff8c7b9a6589b80003fb08aa646f9d982fbe84eabeb7"}, @@ -5534,521 +6662,617 @@ static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "3852e9ec1881607c4df9805c3d8158a8de7cc3225925bc03f42c8dde29bb8a10"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 1, 3, 0, 1, true, true, false, false, false, false, "35cf7696a1b709fdb5b21f996de3c81f040d0889bfdb27e9de1cb35e92f85044"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 1, true, true, false, false, false, false, "2e5ab1bc619b02a19ea6bc25627cb0147ede5e90e1c4507eb186ab6856bf9633"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4768fe4a489286755f49cfebf7242d6a178845a827d014a6109a72f79aed909b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d32e19a6c48cccd8e0600b36293455aec4f066872a00e2d4c90f1c9446303e81"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f7200b700dfa0e889f8f0e7cdfc665183ed64886d8dfdc66feb8edbb9f0e20c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "4251af5a935824c3a8eeea4aabfd8ee8d5918303045126fda089e0ec989cdc6e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "97f87ea64e84eb6050375021e12617fec439efd54c606b616134117272653b53"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "b5441eb8c46dfba8d4addad604d2360c3a996fb034f790f1567da4fbe7ec1cb3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "0c24292a1be1dac703ebc497e21e21cb5fb88bde21fb760f5970ab03c818f1a3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "55f9966f112d38a59eb293f1301988753e232938794105a54e5e308d76e32e05"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "19d06c62ff80e69e57a0233375aee8c4850e4afeee892de4410498ad83ff542d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179472, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "7020e81ea01aacb3a82793dff6274e67ebb3431a2f26bef5bbc25e2bd635afc2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "1936970b3f43117493788df08f89a23d17a13d7b126308d3c454bddff0cfd684"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "7ccefb53d0514f0f681e34cd8e5f373ddaedc47aec00b841c74b1e344a8a8efc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "2ee5527357efe00e3348f39389ec137c507b6333e269524abefc9f9e1614a1c0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "7ced6570d01720eb9fab2228ce8ce93dd131d25106ee73bfa1eefb4dac5d9865"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "3c14511d154f8ed33afae8cfac4f2d595834c3e72e807c2fef9ce13030c86637"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 1, 3, 1, 0, true, true, false, false, false, false, "d685011d90a426c1b3d708b6688576da3a587eeff13fb388ea35aee56e18317e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 1, 3, 0, 0, true, true, false, false, false, false, "0a6521e3e2b2811507bd0989e238d5a6c50bfc8164249d210d2ac2ad67fd0a93"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 1, 2, 1, 0, true, true, false, false, false, false, "840e9b289f9812b334a883382198feb3037faa7794c40c76578436a77ae4c3ba"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 1, 2, 0, 0, true, true, false, false, false, false, "5a731bbdb505ead7e85b9e9a6058f5955cd2cc4958e281f8fb2cd6c051a8831f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "35adf327a1bac88e0322fda466b1380e96c1e579b415a05f90b7ae1e05fb5479"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "44db4c595f9aaab956383ad569f4c3f1214b26e9bce2d87c22a0bb83f27a7921"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "117515778fca13a55758b789d47a31229d40a763fb62aa54bff9d413e698af1f"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, false, "4109afb53fc958af35a463c5c7e21014c2872f9ee6855a37bbd02a633da81917"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165072, 512, 2, 32, 3, 3, 0, 3, true, false, false, false, false, true, "a37ed123500ddb0fed6d57fb04d852868470cb06ad3e2e3444277ff896582181"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, false, "a4a6706464308f44f91957f940b8252a424b338809f282c40971f202cf2aa2ce"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 0, 1, true, false, false, false, false, true, "c487dad787aeee935cb5cbacc81c1127feaa194571e9377031c49e45ea290ad6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "ae2cc64ae0685552413758e199c966a3ef5ea47da8436ffb5a4505ec12f215a9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "bb957804a3a64f5afdd560014afbd240642eaf1eb6c982c25b2e5514693feb5d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "a2cf05a947d60e515cb2127873b518de838802e6b02df7116d23386d66baa221"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "77ba43179b8d307cf5b350c5eed99fab2d95f8f631ee707585411551c986637d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "946287790c26e106f153e2a243d797b1ccbd64acaa30d641da53451e49359bc8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "bc883dcde2dbbc202c8ad7305454e0cad63e793287160bb9f5e2921108174d51"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "ad36fe5c2d9c2d264619700f40236cb78c7dafc3d2afdbf58d3d435b8cdad2b4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "6cc8e1c6fafeb6ef35b064512ca47c5bdcf058c7cc9c006c4025daf75cd8ec10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ce19bd2337a8f510936537f762b4db060f6a3452f4580271904b373ae42eaf09"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, false, "d11890934725308610916778e4e295095a11b961394eb59a9a8ac37214b401f3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, false, "f6054df4e40d36d00fe47a341c3125dce9444824888a17de784928f2ab76dbd3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181616, 512, 2, 32, 3, 3, 1, 0, true, false, false, false, false, true, "33f8e3d8aa0f6bc109fd05c284e8a72b6854086a6c7f05dcc4e03affeef791e0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 0, 0, true, false, false, false, false, true, "f2dcfa1c4e1d13f641dcd43d84f3e2e850385bdcdc722305af8ece3c93fdd376"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "f7d5c55bb59c7e1c9f92490b6dfdda83e676b8d0ab357af372d59a518ce665e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 1, 2, 0, 3, true, false, false, false, false, true, "033d1fa624190961ad4dd9152dbe841d3064d4267f6e8311304ebfab3084f096"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "65e9096bef0fb43e1fbaf054ea06d7bda13a3426285f10850c21c6c08c3572e1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 1, true, false, false, false, false, true, "9a554d3c382bdbfbb34b8edbfc37e7fd87b36e8d42356faaef15645dbb37ce60"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "74d87bc63b4124b8f562d93eaa6f8232a4f5907d3f55b1dff042cc0ee0409ac4"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "10a602225801534833dbd5c41f14d461bb9f558e73ac9735ffdb5871c89f52fb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4ccb3e389022027d354e53670a8b4d7858763dcd787c72a65b084eb7db141a4a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "013612d1b2ac8749a6ad18e9e4be376f282a880b781ffef9eefbaa07490e5a3a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "57f9b1421fd55a99b4e9a290a1b38dce2b2d8a5a641dce024182222dbac80044"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "7c2117451506794fd3bacbcab1c7119b92720afa5eb0cfa8de3f2ba3b26cc6fd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "d3bb6656aa67ed4433921f6e91d779acb7666d774d2f7144237f2be6c57aeeaf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "69270532d77ef23398cfd0b96440d43d956f04cabfe483a332fe3562f2e79ab5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "381b31178140fdf8d32e6bc619a2c97b8f2d0cdf4a7004a03dfca752446fcf13"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "55860e02ec851d78feb430454305521a68581c6faef0a607dc9882f5fc20cc44"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "134ddfd4d1c7c8625af7d281a3bb477340006dcdcad36d57b180a8b4d22e8472"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, true, "b28234a2e9301150a5dd2eb762c080444750b06ab690d58f299f186e4bd85b53"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 1, 2, 0, 0, true, false, false, false, false, true, "46ee76229822477d6593653543a67ac20644ce6e701b929854378642d4563c3c"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "58e39217a707eee1759654d2498b6999c640f43ff4eebedd50697c840c0c6e86"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "e856e20ce5b8b397ae8edb85fcf4db6178386bbe3a45b823cf554f3310c5e7e3"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ32Kv128StaticSwapsAbForGen", 210104, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "a7ac43288709d2a42e53df8f093202a8db7c75b1870efc2b8b438e8043a98e2a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 156864, 512, 2, 32, 2, 3, 0, 3, true, true, false, false, false, false, "f3ec32987731f17df7d61681a649be6f222d29cda50fad212371933e86b4b502"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 0, 3, true, true, false, false, false, false, "3888400064b479e82a60d5de4a20d5f7872e3df7c9a0be14e4e155b3d7c28738"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "3f4e6f95d4da564bf27ab12b7f31e07df71a055624da26cfd9c3ef960d604949"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "9bcfe88314bb8838222482d202205ccb8ecc05c674dd6388ebe3db5ca71e41b3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "2f342efae600d6e1f72cf43c1d3ab645fba6a53e3991d9d18ba156c3345a9b63"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 0, 3, true, false, false, false, false, true, "554d579edb18b80f7a1f9c723beeedba3ad74697832770d43f2d413e41dabc58"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "33ae00c720f7d316e4d8ba8a719b094778f8e676dd54457abf0853ab8f59d077"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "55ec96afd1a15ae697dc89d5f4f7359947945146de98d25443cacaf1fec6645b"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "0ea2394d2b653225baf125813c7389f06ea9eb0f705f7cfa9ca3a5a32e3d1de9"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ64Kv128StaticKeepsAbForGen", 156848, 512, 2, 32, 2, 3, 0, 1, true, true, false, false, false, false, "d62cc50a6859c818a11cb01d319cdb0081b017fb88247bb79fb0e9845ead81a4"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 1, true, true, false, false, false, false, "dbb20a16a0a7734f68b82e9be8bc3661a8ba7abc925cbda7c977f91ff3655978"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "e42faa1b513104c3a9e7f2a8e694503b8d0a47541055bf776ac73bed68e02c93"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "f8b14c48e080d07fd909a0d568824b4b0a9db0b3a1589236f15a90fb1b74c2d9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "b74bd3431c44a6053fa327cfa9f7597abb4d293dab96b38d82d28ff8dd38cee1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "49386da343731c9a4390e368fd51da96da397df6360ce623204cca9bd76df0db"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "09595d52dca247653c91d59ac14044d6e1437ecfebd3a22dc377390ef1ca8617"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 1, true, false, false, false, false, true, "a66ec6ebbcade92cda1a8ff4de28ec95cfcd0134fa776066b8f2e642ba6e3632"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "71ef06f620acf3cbf71a2cba6f68236065929a7d3fdc05edeab7d7c19c9f7aa2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181600, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "1654a7aed18d37fe8c5f0536cc459a064796a8661bd2c7ab6e08ad72f4b59d59"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6be66d7c23e7fcee0f255335624bc83dd82a02744a4aac5eff8fe2cfd011e242"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "d300f33db1f1d8182d5b902e7eba0882bb014b613bc981e3862206e5d7052e7f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "0a10ad723129f7d435eb53db6c5e4010203d61b3ee468e79fb94d01f7eefbacb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164192, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "1eee8fff5e517f7f829bb02cc71a5cfcc632ba9e12cc60ab488e114f1b2648c2"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "b272c8cb57d1bc1bc5357f5580332420b021eac73fce8f5165c7c4d71aada4dd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179472, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "d82cf8478c1b6710644f59ea229fee0df3c3dd2cccccf964be5bf64354a3e2c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128PersistentSwapsAbForGen", 179552, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "057e6e001ab2ce7b002832692beb9581717242feff9bf0e66d2e1179c5bfcdb7"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 32, 128, 32, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ32Kv128StaticSwapsAbForGen", 175280, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "7daa9877baa3f6d30d33a619f8ffc39ea922e7b549884d6bae2fa8a675d40f5d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165136, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "f878fe99854c526fc15a44a0292d4be4831716e02ba65e77395d3910e03ebf76"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "5e3792dab55a20ee210590a4170b54a1aec11e5caec4eedc45abb893ed5aacca"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "653776a2bffdb0481119502c3f32838acb0cf0f8a8848209e81f98a25209014b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128PersistentKeepsAbForGen", 165216, 512, 2, 32, 2, 3, 1, 0, true, true, false, false, false, false, "64e2de81db46ea4936ec3f36e20d86d26b8205df3b21bc28c75639073e1e6fd7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 128, 64, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ64Kv128StaticKeepsAbForGen", 156832, 512, 2, 32, 2, 3, 0, 0, true, true, false, false, false, false, "51b93ec111961e12ef6e51edb36c3ae695bbfe2c4e8364f926868710b132909d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156512, 512, 2, 32, 2, 2, 1, 0, true, true, false, false, false, false, "970c635c1c8192c9ca8841386b1b95a93ac3456ac2e92f290948139806a58fdb"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 0, 0, true, true, false, false, false, false, "c6ac28543443ea7293afff32fd8b54e70ddb02af02d321ae3bab5192c72ee9a0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "84376f221db9fd0dfe1043dad50d06f1b29c77c4fa6391833dcf88a9e2cc4c07"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "024dcca05b5191c38676cb674a96fd3c0d43b15bbe5a5fe71d7f39e4baba9385"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "ea95f8c5b638cc4df685ad2ad88d97fc2b3ceba9e9175b922fe8684d0b8842d8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "56cb0b0e056910cbec25292e661f22f0f918a11d36d2e82f32ab3d41951bbae1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "1429b0ca69acd39afbf903815fcd6a87761f9b67e9a708645b15cb621d69848e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "c6e8c6e498573b22f42168eaa63478d25ecee1d9f2570da5a4df4f1c3fef5082"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "229a2a798c12e9df06c693330c2fee42e03122418a68a466f343e00d66831c5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "bef1fe20db8bc72e938e606f9b0f10f0f620707d0eb67b3bfe7415356ec5d05d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "978fcc07e932d43f84d725288b97ac0b57d29cbc69d58b427609a36eb9aa4ba4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "09006b8f6adae0a57d0b2ce6ba8ca1464acabfa60376b6ccdfde6639e2c2cb63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "acc7a67ed6cf3136c01d24fde94a95f0b3a69e04ffb808c350770e53cb5f4cc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8e317a29aeafc0c30127b6cf7ac7ab82ca066c8aa7fe16e070625e5452610ad8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "0ce67805ce26ba5fff27467243f7b94bd21a9e5e742a9f66ee08b3fe3ff0f305"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ca31317fd01ad455a579d09d7c1e0ad8f0fdcc096a390f7aee6ecac7c0e6bfad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "3468aad52b892a2051b2d9e0553c486e38e06243c37cb45a76166b290079ada2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "e091a9e3e1c208757fd097c7b6018b91e64e0524bd0ab95e5de7c8213312d076"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "c237358426082fa6c8e0511c7a22c3a11a51c33832a2af4451da9615d5d75ebe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "79dfee8b0e6ee6bc196812b142d9b5200f16906b1ea2d77052c8b520e1cdd4e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "e45f5149a283abac85fe5aa17836953b9c787d1c16f5adf3f59ad513083a0a95"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "707913f91e18a887f974a88409194dfacf921efd3713f86ef81c2ba9941010d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "582e963436971e3e94e1224ec6e97505432c754a43684d19e00f472ca170f5f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "6b93ceefb5ff13722db1bd27c748c15db99171a48468a6eb20f830e81f6f1169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "adbda1fe081df9bfe4cbacdf9154ff33468ee35b9d4891f939e74e26eac9fe00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "b5c9a3ddb56468a4da0ac60dcf0d7353259974538e0149e32b8761b6d25bb18e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "a33973395c5c5ee501329bbefdb29224d0765401e7b76d9281926745a6ad531c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d6de97988abb418830c674cdfaed4fcbd8eb6f14c80510ab1e6f96355a38a5a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "e2185648bd0a8396cc9a375f90bddf57d1360dc102c55a5fe966436a2594b295"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "34471111b04d8e9f28551ddd271ce6735a1a23128c4b9d4ad4aaa03cfe517890"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "dea88bc854c62c7f573cf50e2d5311e58a8fc4c87d866468a89c5be5fcd9a331"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "3dbfc59b5f99207306c2586703ce6eaa8420224df9fb34580036e3b184d986b0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "ced969b3eb363f58cfebb555512ab6ee1334483c9f301ea8a336c74e82b2b55c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "9c5937bdcf460c55a167885e9bbd4274de2abcb5cdbd28d805250831c32cdf81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "7410571335e86dadc0ab99eb75d3c8ee8b2d702a2ddc044db8cab4f3b18cb8f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "64a5a1c6e394d0c43286a217e4f863214dd9e65b9a678409cd233d7ec7a5bcfa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "aab962e3f59a8ea3aac7e7bcdc6e21f279b1c7c13ba773c25c8f557a28bbef5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "230be3f893eee9e559c5e7441be042cbd7729d053d5a866ddf6cfe720d33a97c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "939070b5520652c37e52b4746273562a64c33f7cd619ba939dd3e61335f32f04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "36394bec415734be094109f09c91998485ff00a4c4ccc3abd56fe3d41c2fa98f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "ee0a090e45f0bcb1686d1e5273e1b6c05209d860e650c81c1cf0804c8bb055a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "6627261a93de7b8ed22a1ecf4f0fbfa21942ff3facdfb65e8ff421cef62cbf5c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "d66498c51f51478b129ece26151b9da22a1d1896351338c9f24fc1db431b6688"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "96df0f138dfdb91f3e6c6f37101e15d885e95c477951f57af0be48731dec9c5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "989bd5ca1663d66b5269edaa33049b14adf27f8d4a79b2f87cd5ffc12c68c66b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "86d622ab72db15e7da8c2d90064f5de5ba0162a28bc9cbe147206cbe3c8239ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "98928bfd9762c92efb797d1a70f3ef6ca25bc542693634b09343160e83388fa1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "cb14015d738ae5e7c0870d6660634236bd3508eabe2cc277fa2ac9331a6797ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "ba624fd133ef7f961d64879d95d30ee6fc2b73fbec1bb36c0b80473b1afee972"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "2ba690a335b05a76753ce584462249c29e80c4afb17fbc17830bf9f109cfd505"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "49e19ee4875fa6997dbe7fb0680220c169e75988de625e3e751846835a891b21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "f744bba6052f463165125e1689e499846ccfc6d148c42c39d0b2fb8200348e35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "4d6d867fda7beeb114c48d7a44730fa2d375ea5538e541cceb2cdd06670a6e5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "2d23e20d1dce1e7574714db0014dd79cb4617a7227acce9d3388d87823423a42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b9e15a3f367acfcd31e083a6eccf1a0a704f21291218bc3373a440f34646cf47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "cb068067694930592c5006389705dce553e736db71ecc13f4c7cf2253dedbcf5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "c7ebaadf080b8084ac213d5c9cd9c94c63b84161af66a05241e5461c1b0c30b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "aba51f01feb74419611dd664f15e7123d98490537f85cd516ca75f9f4735dc3a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "ec300df0dede0140ead05fa94223d76682a998d3c1f8ebf3251ed88530fc491c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ae724ae78bfc5928c1cd99d78303dde4ab601ad0f4abf6d0dab2a9b046fa1a6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bd6be49278c64291dcc498089295445f7f9efcca65d8ab64ceb1bf48fa63c5f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "5002dbc054c53bde7faffb9b0652ebd2b7f3c2ca578ec458404672c1d1df20c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "2e662c31013bdf7445a2757078079a81adec763b2520f2c7c24bbb18c8cef6c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "3e1645b0fa1e896899b300397454654d8e3ba35e523564eb1c4ec53547221e46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7ce7d65c7c2e9ab7392591451fbc66aeea21da83a572775d6380cc83052afe9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "03b4c6a5072d4355c395f664c1511a34a1f4382eec40a059f723ce45b7ed208a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "f499d402eecd24c727a3434b863b29160cd85ca801a6dd8d3601cbd7128e3853"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "7c5fa94e794507670c6da478a2376cc7451358863ecbe188d367f8c8c1f8f29f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "7152b5ca9213f995f059c211899bc061a7e5872bad43f73d61e4710a2d022540"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "69f00566de5cad1f1c6b5d64e77ffcf075545c87ff01c1ba114874307d6ad9d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "fd45a792b7d4ad316e835f0b1317cf057c572146a581c1d316e8147d4d58985e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "4fb7f7ce4742819a886adb71752ef7e2f1d854b7ea0db5b7ef526df73addfb3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "fc736c15746148cc2682e2397877b0a3ba573d87f4ad8462390ab35d8df54c7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "89d78e714ee3640afd562da84c98d4d3cf07b60a2fe20068e3fc6db56f59cc23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "ec601cc6506026a382fa7c95ba8fa0a0ff65a4624632d5925d004d3ae6d2e4e3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "9847648eeaf7c54c667c52fa71f9394e3904dd3a98cd3c3aed9d3fa7f483ff67"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "05ae607700ad312a6b2d52139879c9c7189436571ac5e6deac64513bc4ae24dd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "548d084487d2d6b83195efec7040346b850bda683805d3e74eab0146661e0bfc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156464, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, true, "38e4e31a107b3e20bd4cdff92a649fc1f3dd8d610f64c7b277456abd888c0d66"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 0, 0, true, false, false, false, false, true, "d7dfebded9ae12278e0fa47163756660c7bd08b8c56e4d334fd4c3a1b2e0f01b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9c0b6df16768698f28c3a9f0db67085388e006c8dbbaaeb4a08fba254965026f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e86fb327001135cb37d69100317a8df06336a95e2cb78e49d32d5cfb1a21dc88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "fd81e264d5f9e6269f973cb96cb84c98b8493e1ad4fc2edb076bffcd057fdf57"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "57aa5ac9effc4b71e14b35ba57f0064d3e57939ec89e96a3ed17a7573fd58c9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "9e4caf9b3353d24dda5edaf0965f58b32ae24a3b6bcc66f2c7601b60886ef681"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "5850d52974268fb7c942fd8f6b06eebbba070ac0b20cce96e822bbee67d26d36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "3e0ad50623cff5c56348c857fd7ac2eb1e2e163ee633be646daeb414f9d04a23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8bda9621c1ff23192f9161ed6507ea3316765c65da0e9e3f90bd2d8eb0ed8472"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "a869eb76fc015640cdb6ef52ed592086a5dd1b2fd35346eea2ab1aff0cdfc5d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "640c351c650a028166e010b2f2b8823f1397812efa4b30b97f8a47a0815acd5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f24bd3d8935ebf0156ba3a679aaa7cfb7441e1abf86f0c90b182b65818d7f6ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "5cc0ef7064d4243b71504c297967fa3c0e4dd0d98f147f9622da2af3a7c747c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "45c68b17a4589540f3375c99a1d6ce2dabd4a8f9ed5e28cec43bd4c13c34c56b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "1ec165b86fa691fe154a189a84e316c255dbf38a2cb9e3b70a06b69c99f24152"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "2bf04521428c93ebae3cdebae5e430176cb2b58ea76f4fa16a3a43cc96c5427d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "d68a0e02b016b07e79e80a3d583162ac9d92507667260619c68424c4e3e72cdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127296, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "768f4616c385fa05288c620f65dcc7635c01250ed2c54f3675f3b99fd0003593"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "f66dadbb60397935a2d6fda16a678f1579593f0a10cda2b82c41aab825b666d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169232, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "e18f6526457943f2fc30ced2694617ee6f619ff10d7da1f351abcc521861d1f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "0028961c22fd9182588a05ab31af7586fee2f9d143cd1a1ad4a765eef36101c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163744, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "fc2082aaf436c915185d42fc5cb544550fd0c9bf22cc15d31d7f9ad96ad68a68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "f7e621ff6ba7c771ebff25d6ecdfd8210b8de6330357d722676f844ead15327d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "766996e62164b7c080de4fd6996e6c623c3117cc08945950d97c6d8829731ccd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4efab4dd59bd4001cb6e6a7f3928000beaab2662d88ef9d02fed2a27f7a8525b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "8a022eeaaa08bc5f8c53ff41f0f2345419616881681602c492c57fb3faad18f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "57948fae4e1bf88aff0962dc3e6ce48af9e3f98e1cf33c41d9bab44d083b1616"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "3500da9a3ed2f9b67baecf08bdc85a8215eeff63b842beabd548a1dbb9becc31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "6ff78f1484fd60c8e119d5772041d78679bdab23659cb0b2d29827ef156dfccf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f4192910102b55d904069aa27ef9f1716efd1ff1fe633afb085f1955c22f1cd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "9aaff75c8ef227802ff983c586865c42a2b1249a52e5cb299ba3a39a71d5f5d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "bcb95d409b4706952d4b1bcaf73da9a9b603b4020bdfd0fad7da735128c07faf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "a1996b4ec2b85d11282c0eb4d15312869648791b6ccfb271ff0d2ec1d6a641af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "f8f32e71f17d3f40b7d7e1433915d34996ced8e358fe62f7746baad56aeeaaaf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "1cba198fc92c8f9b62ba7caaf5db26fa2faecf2787cc1bab8809e19112d59a8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "e855b1ee23b764a054b9bedd1805e98c52c029caed169b818da467ea3e5cbcbc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "91cd0c2bf6bdaf4acf9b695d7a59befc76a89b0695f39c32038508247f9af16f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "c185ecdab5c133660931eb573775706d4f3c931055e37f9f5f97a775f8c26bbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "d8983b7cd3524e57d7f9964ffb1171ee7090bb005830aebed973399c5e876fa2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224656, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "7324ac7f27f909d6b3da368ef6d7183827a7b8ab7755352580456da952a5bf36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c65cf58df652364dc9456c16bdcdaa49c5141d2c3591d8f35340f49669924b5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182544, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "d2be1ad3c5c5b2b28c1138f01dc5491526f30884cd5fda2cc60f6791ef98f4bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "de775b413a6bffc8e202f99376ab75d9ee453e8bf8c048a8e6916f7d676ec869"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 175008, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "ac48190844ea9ecfea5d50b585e1c824c634c8a23b93f544b87f6f51b96077be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "a49829588c4ff8a0af8f4b21f0b793a06652c40e67a687b06f29b6b4798e0074"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "61cce992dfe23d624b786bce6abaffd0d50567dd706439770e8c3cb25e4a8470"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "c4c69042ae0da50370e3409cc6ce98cd55664492098dabb12c34144136242ee8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "b06074d4e8f88a448ef0de2383c9483315e8699c043c571f04ef65ace414cea7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 1, 2, 0, 3, true, false, false, false, false, false, "305b2b2cbc4673e2ae144bf60b4774a9fafffb26d8b7188123cbb87452cc2cbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "580364fcb8de0a50aaf63d6973ffb410002009032c5c2acdd99d5030b9e3baa1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 1, true, false, false, false, false, false, "8d7a63923f9ce5c6679385d4557728b711e9c70c9dc9b135600aca78e32e1716"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "9a34125d39a8236c4e38b42fa7a037c620ea24e52a80c07dd14d658d685bc9e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "68af8d91c5039658626eaa512c7febccaeb5f530a76f2c250d4769c16ee5477e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "fd59d419cd10e8457d9db5f4096feaebcdad99c94b7c54561298e4253feec341"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "7e3624bb50bd2a1f234f7bf8e9ab7b022d6762ed774cbfba47e055285d0f63e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 1, 2, 1, 0, true, false, false, false, false, false, "72d7c6b994c5da9e823c835bb1c5fcf41fccf0284219430c6724fe46f89f838b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 1, 2, 0, 0, true, false, false, false, false, false, "4f4cb5f277327da0409911419d35412e8b00ca1b386e68505bcb60896049d716"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "8bd4d7736c1d2d63c38acccd9663589b633bd1985161b81c77e29786bfd971a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 0, 3, true, false, false, false, false, false, "3168a502b3de28a40a257401b4ea1a631c15a9222702192683006671a57f79f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "7a0dceb04653792fb2dfe42a67dea212414bef94d7f0020d643620e146ae42a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 1, true, false, false, false, false, false, "b3b98677423d723069e30c67d8d53f65501b6a76309c8dd998eb6203e16c55ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64832, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "cecdd21114538e9b4999b1fde74dbdd5f97c63ae82f1c2ec2b0bc09ca1e31105"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "eddf16763a0b71f7136b3ce857e87427a5ecdc5a55e6b8b854d981cd02598712"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125216, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "5dd945f7a8d954d0255d54ab109bd3e675e4a531454f61191b5dd377c3d311fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "96d109574da8b662f221677c9c79ff61512b0df9f1356abf9755a1468eca8507"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121248, 512, 2, 32, 2, 2, 1, 0, true, false, false, false, false, false, "8979ca0365b8252df63e232a80939fb372b70f5fd55f9f25a70c7aefe2d38a07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 0, 0, true, false, false, false, false, false, "b298743a58a09e13074988274fa2d49f5b58fcddbeba4e764571409b32cd988f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "55546e440e2b190d78d2d7ea7f8ce33652d0a51c410d9546aae05419e1cb5930"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3cbf2a3fc44ab92c53d959ca98ecd35c34e0e1becb90a5de5e07729ed8763c28"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "169addeb3e6894227153743878394e721cfb64ad59af3b8b597755ea635b8e7d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "85a395d54235678965c767c8681c98fa59d7719f6ae757a374b48d1b0c745bb2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "5f79ec634daa6332f076038ac3a8699f9cf492949115c309a5e44747b35c2a36"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ea8d1876266f58809ce17075e9fc1c1872fb10015a192ce5a50acf2c2019771b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "5f0681376c66d567ae0c5d8244f2ee97a45b9f3c2ff3acd27f3c2b1d1c91d24b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "7b368898f39e3c5883593197c6630b23ca9c51d0b53fe3a3f497b1c578e44cf6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "59be195aae6e47587e54d2b971fb2cde0cd56aceca46021e6806d3f9416bf5ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "e879bc7a95c8ab4aa89e558dbf32eb83efe85f55fd0651b15f451ecafbb13139"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6c08c093739d9d3b2a9b8e11bf54909e48b69d821509708be45ffb1406faf0b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "4d5459d4ddd795cc91b7b7257758349370a40164a77643ef36c5e80518c71c0b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6d7d6dff2cff2c74709d55342f4a62617643a4b039e2ae17a46cdbb444ab583d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "49174ae398e81d0f37f4a8132f5d1a02c0fe2c56f68eed8fadb106dec8a38557"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "b7f236f0e565c1b3052a734225f643417582783e63adbc02bfcaa428e942f924"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "9fcdf81b688241df3a37a1e06973449ec486448468255b4ca11c9b28ad71b33b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "9194643bb3801aa297b3274b6210429f3f759d19cd055e7518cd500669ee0778"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "cf90c1e7c8cadbcd89a1a14c57b5f52412fcd9705b6f9d77f22f2fdae39c197f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "32145a94cf262f851d1bc3cea8332b820cd1f84cd01197349e2f587e9dac371f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "4cb583212a2d156c187a1b30f5a6e996439b566a41337b21775192fb0fa54f02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "724a64faa47de7a7ee514322fc6ae98a0fbb248c3aaad1df0934f8f0cdc11ed2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "52f7e3e915a854e90a987b98648f21d2c4eb0d3c641abbe46f65a177ae1beab0"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "3e2d6a17a58c5c7a11f584736b2513edcc66426ecfc51c8560d974bae95f3ca3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "bea0c105ae0b5fbe0f86df2359a43e6d63e2c6201263c36338fe3577443a2e3a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f8dd5d04ed444aa0e8c2a68ca3ab5e3264df2e807fc629f1344a7ddcd79f8d4f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bb39559469781a48e2e4cbe1434e708352a55a22d707b5d071993218397082b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "1abcbbc3180911468df6c7f0461e446339d57e4f9ed63101d99178aa70246472"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "e80d416c0c031bba829dd38bc46fe2c11e89c473515bf96bd3a419286f218771"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "2e426e0fbb87bac16d4d187604895f2e2a38bfc8bceeac3d3312ee0fde1d59c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "36f890ab7ce30d7cd8e993323990f271bbb1d699d64c77c7900e944fa15e745b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e9e91b3d795884fc23e8df4450f4dd1d3dfa30edf136250adf71470c65c5ac7b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "50d10365e16428749bd45af0c229ff54455c8cc4ce84e343038c2a52be851315"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "15f456101c8f0eea45b2b902362017f8c5e00c3de16da18e516adcdd2084683c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "c4a20135a5f4f0e969633c2b6e680b4dd24a13e80ad3b1fa1f59a721a7f49026"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "be77cd7703a5105a5172b2afd21bd3d1ecd19dcc232e1d58f387514be780cc27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "15d0a3355c350f5611f80b0861bf8e17d541f9f4a6828dfa80ca7c5fc74f352a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "5755e5ba7d56082a233a93deef4f0aceb3ae6f3547dac26440fab041f63fe60c"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "50cc97a34500b8288840ee61634ce2f0152aeb3e3ab156469d53ca3a6d0e4a8e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "2c6e1445a40c750560dd4ac90c540893e4b51951817311b2ff67168d21a63cea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2dcc7ad9702acfcde82a9dfab308fb0e76340e2cfe804b4af0ce156ba9c7d7a3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "9cea34a04d6059cb332bbc2e24535ffe0bce4abe8577a88c6c47e209ad561ddf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "7b43ca9aa3fafc851ba96992720285c569e2ca5ea053d5397771d1c019f3e5b2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3851eeb53e64a932339fc017d28bb2c24738cf0289fd7f45d4f84a6e4a997d54"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "f83aa643b4703797448a1a2fb1cebf1cedbc0810b823dc90aa8f4dad62189504"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "0bb1f0cc855408c692fdffa133db176a783dc48e31f07f2b1670f4517d4fd71d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "2e7c5e250107537b3c1efeeb1f7b99b4673185e8bb1769a47f06da41c24f198b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c0e8467b0c8e6ea8eb63afda171c8846a3493cc65cf23288a12a36c243572b10"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "92c2dd4c4b311d679781f4b86b971846171d518c4e41f6b498db96566b5cb5f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "b47d02bd7d479a4988a5b18a0c406e846aeb6d4aa6a32edc94d5e27d051a53d0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8dd72dc93a716963508b8cfdc0bc05ec87e5af068c35260061b1980d3a0a0a05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "5479e9536c902ad7f7f203163181debbffd581d0d511bc610bf57ce0334ad470"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f9277b1f1dd44eaf1171048373dc98f430f04ed1f53d17b5e9ca087ec075ce6f"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9095978c53e416b019ee2fd5eb19f6a78cf91f8f727ee1cbbc7101692e852185"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "69c316ba40ad19e4d73bab3a8072fb2740f0ecdba59f403ef33e88882d396572"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ef81058a93723f4c910f59fcdba7a1818dcd8e2259bb70f222be127abcd992ac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "7bc7db17bd1c4639ec10bdb770eb90223b7dcb4f811f5484672c68cbb34c061f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "88a23a11bd865eb2fb871780bb8d9f89fab93cd53f974bb4e1875a8b543c6742"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8ed0dcb6b650553bc7616ae5700e1258a2b7106e6d5a4f104b7625bd38c52471"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "2008d8f607b0f74422c20bf7b22921bcfba2c2f8ea53ca44d25e110ddb0c81ce"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "58bd180b2fdfa6e7cf5f5125db636af92974e133fefd4297f1124f5f04d2f001"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d5d3265b2ca043e99ffda31e33d7c3cb37a48a6ccbeea1da0c47ccd3b53e83a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9280b6b16f623a46c8f895ef022a08d084e3ff5ef237806d0d4903f7625fbe74"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "61ffbe1dcace7e76efedbe1fa37938371677f6c227d31adae72adbb78328ac9f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0145e90336fea5c01dc513bb37fe91d5593ab5f8c13051e3690a8ee6bc7984aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "455c0ba452172c87a10dbe29e851c184b88b705ce3ee3c30f125b6a23985736e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e459204384d8b7a8216c6840f9c82cc370b948901da967408fac0131f282d8f6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "9a14bdac83ecc92701e8ae86aebe51fa34c28dec7589f1b089e21aba749c31a1"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "9b68068950cc8e00c11407eece628b689eb6c576515789c2a35d40d8cb7849cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "411a5be5eac5a67e3e1cfc293dadc076b68a80e947cb2352c251849ad43388fc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a8d0ee6be1323762d662e678ef76eae9316df74028687acf22f1889539913a49"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "2db89676426544d8cf3f25da02d44773e8ebbefc723318220cd9c612c2806f54"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "9694e42bfc56da8c2268944dea8763422acde270208997938ecfb4373f165876"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "908a97c750c404b737bca8f9f43a4fa032982cd5e6afcaaff06a2c254cba048a"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "6ddd1eb0a6ed5ded94a3dccfb3e7238a2e7445931e083c143b4e5829fba69c50"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "4319c2c82fd56d54c0203a8175a5721c868a335b443933596deba41fa1777bc3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "f5658c6375bf3dae123d8f17c12c25f777ac9ce5e18b6643b60bb3ad4252495b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6c62c59c7f7e9cd5b94db1768929652f9e5435a37cbd8475fb3870efc22c2a86"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "aae61f0d5209d1068f4c5d7542cd25a1e97dd74db5364ac18eb5da3eba1b8e7e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "1e56c4ee8e4687f73b32a69849873f551e84994d61b58d0d062f34db6dda6e22"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "156fad272b42842b138c80deebec9ad571ddf56a611948f9940f00f1b6b6079f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "546183b490431444b8476e2719e3bee06168a8e72d14f9b2c93a2a6f03fa0edf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "393bfc6aedce2b00a55f7571906a34bf30c2d2f41e074b32c2570a3c0d08bba3"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d833f60ad45aa56e1fe4b8aac7b30c1d57c33be59cfda8c1ed35f4ff50add44e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "da7dc9a719e2df256595ffe2d63300c581b2378024aade1b86309ec4846eb9f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4b89b02845494fc4fcb569df9f91306f03b1a98f40ad5885375bb2d7db43d1b7"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "fefbdfea7a70e47d89ae85527bc1aa7e970eb7039b495f60d3496d3fb3a65002"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d588565b09fa80749365cc77b0ddc3251c83a0d52bbb157bcb460bf2dbc2a945"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "9652db3d622e050d61b81aea2e02f026ea0a336fac0d332baa23c411136246f1"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "0f5cd79f05f606d96937be88f1a6d914a97df2be0baf3222bed7b04f0632cae5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "0fd15863bf4ff9f1f97281d3390244997fb51d636f38c12a4f0c278ff3d7ec97"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "be33be26da0cf6a117a1dbb331dab37444d2c8ed64d8ec23c2bf0e1c539edee5"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "d02f9443bde716f251c79c94646ab5d0c07d0b383cf2144bf2a867ae8c4ff685"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "519521ac7807326d8f0f91ec4a9b69d595fc4d27aa0f02196e1b9eec7350926d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "a97e1e73762e95290bdc06c280a0598eb191203705c97508b186b3c86f5c47a7"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "05b3f0cb8aa0ca8c01dce224ee99e1dc9c67c2c9d291629b7147059abfebe155"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9d145bc4f96f142de7762931e3aa91c828fffb12a91431856a8ba35ee1890a2b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "37103e126823475781e10d3f9ce89c75eaaed4e66ac2167fde94d23da7f7b095"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "398fdaabd95b0a731f4d528571a3c9d82cf2130097213cf4d8963b5b9e2ff421"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "85c4b60e876977a96b67cab6ddf21b2a9ab46a9f9f3604118dd9ae880027bd6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "2fe86a385e047f192ee0e51702c106cf2433cae75372638fe3a905235e3c31ea"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "9aa8eb616d14f349639ebd66438ebf2331e12852cefb62f6553af7418d461670"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2c128f8367369adf4ff8f16e3eabf3b0f95fefd10e0c1d03e903b3cd07edbd34"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "1ac74191641fb4a1972c803a0e7cf882932b02089ae9fb8de93d6e5d88854cb6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "212e66e277a728563e401b0a3d6e4ab715c395c7864582deb7f5caf34084e650"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0ab0bacbba32b61094662ab57e572f6c46a97776174709189c2098d7aa4d6e6b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "720d3f1ea9f41a8fc66df628683b43f10a4797e5a902c0943e6cb2524df93d1d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "bdb4720a4cfb0b28f05df37b2e62b2e4bdd17c95ab267a72739b76234096e7b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "2f2cc6d337e67913a837f4ffd4316038fa04790802cf7b9ec9c911504b1e4c5b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "952a659ea27d4bd51c88af5052c3694f63166a2e8e91e4b734e300383008e97b"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "d908da8e3f6091a3800df8c3fbf5b9039b17247d283a74b40ad52c2bc73bc3d2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f6e0fbb212c14caad8c7263d66754542f6e641993daddbedb6fcd20c46609126"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "73c7af5fcb9ab3d3600e974fcb74b751a3a120a6432c5f99641fb3d2e73ad35f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6958c6e389d91d55bfd575bfdc8efa9aa1ace677c32a26c6b62432761e3e331f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "74b6e6785267f44d80e2bd44a6c79ddf7142a4e6e9a8e76811c7548355448748"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "891be1d62e3784e731e874ba38b73ff7abfb09a4dfc4e6d49ce58a2fc68c61c5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "bb86162f035a31399619a974d31002e5ba255e8419ae22fc20469ebd734e9f7d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "49a47c557e99073c586f90131e3417a037e43f7bcf15d1d4f1df517e59925b80"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "75128fe47df03d9ed82415785f9a0a9851fa5d5d23832b88d23c9a91ee75edc0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197920, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7f6cff45e10b0eb54aa918201a9187adbbeb9c0ceb3724309198c6d16c25d52d"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "b8626f66319ec12466d6f1fac3615baf094999e20b66a00c889f017676292e1a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "08496d90bc6691e3a69dd52628922fd6feec7e86267afff8bc22cbcc543fb6d2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "8ce0f01e176d58424385f3b820cc948f61716ad503b332009b06959a64bd9a80"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "3a25f25867e625f35f7ba4486418ab2a1e190f1b9b1deb4897f38603e481d94d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197936, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "d40dbb58f15eb798caa800b436d5bb71952c319bcd8988adc729afc5ace572bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197760, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "3b2ad3d0d84130a29cb92b8546cf064acec752c898f2bc8242c6b080ecf48095"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "488c88f44db777412ebf12e0804cfa98cfe7b7f5fc9d06d88e73e4e292ccffbf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "db3f2268befec853837daf3be93cf6a82272c991552aa751ef17928dcb055955"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2ff864e1cd81c494a86238eb130876f31b15a3a225d0c326ad822572c0592577"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "c9041e1e106cdb5842c57d299da66e68ab02601552dc7514833d24b3afecc4c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 197056, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "db83ee691d475fa587a18dd9449ea2d6e7ac3d013df00fb3848f94f7cf75c5c2"}, { DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "9ba0c5ed4ba2d94f85900efaa5d3658b1611517d3e6c435fbf053a3ab6587b43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "4cbb49914292e591c9c71ba9d5ba5a6ecb938a16d93a35b566cc91ae2c2c27a0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197072, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b160465f34745ca4eee91954d9867d75c7d48a3b4f46690cf83ac6e86c8fc7be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196896, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "43f37f6b75d63a4a41cc016fe9e7f96c79f6de725ac2aa71b487de8e7fe4a8f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "afa13529bd2826e2e5c38f2cf7c4c98550b426836fe092c92226f7d0e110cac6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "7409f8570737b22ff4996208a52f3d30468dd3f75efee79a0d00dd5930adf8cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a1ad0563dd25b1d087a26f51f262ea7fe808c2a20ad8ef2c499dddca47485e39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "109dc90aad36301f925baa212614f20fc8df36590a961e7b35a90ab53ba3258a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "2c21b3a1ecf735aba7479b8bd4f4c4996aa3cc1321666a355c18369b2982e51d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "84771991e2cc8ffd5c56a877b1f8525b5f3c7572403ebb6129756548e275f29e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "89fe3360483a7675c52fb98b89e31101cb77203eb615aaccb755946165b0841e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "d2f8f9c494643622b2fe5b7c29d651394579196d320bc6273dbac99037c878c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c5176e13f0e0f0b672c56895360182d84942cbf11a38c8e0d1bb352ad656d012"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "3c72c14700167f1435f08ed6495ce158e7619549696364c7142d2c83eb3ed6a9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "1731743cf81c0d3f024ece9a3edaf730c0c60fbfca161b8577453b95f4f7ff1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "1cef7ed41cb5c1012a7166f512e9bc8957c94d42857ee0b46d92e4f4c13d3a72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "2b5c99cd3fda63582b7a9deee60deb879e116bf019f7f7ae29bcadc4e789953d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "e90f5060721ca0429bbcb433dd9a8ecd09bae41b84449d92bddb2eb5b16aa1a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "81762e57626de4042f6ebb8cd8cbb2ab845cc6c963c82d5df4533970b3106c32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "aba8cb84d3c2f21cacbf29a78a5122fb36737c7cf0d50ffd893738dab3be8cff"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "6f5f3467f769947e80ba90d318ec8d3b6ae4b0ad702a01c5c3fdf6d62e0804d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c2f886ed440f239dbf425573810ac54b9813471f8f02f4d60e9618fc33bdd13b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9c256548c6c2cb0d353634af77c985568fe67cf2989bef6a0a79998d2016401a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9ef4fe04d692459e24102a039af92ab7e5ccbe4c8f757a430170c99d47272f4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "a6c840560dd025d9488c5bd871ea1b7f1391e558b86d422574baaf431aae0b18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "1debee0c392b6c5a1bbb658d4c4feef8b2ec26b052321c0dbc61eecd6628a883"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2d5108364aee49fe9053dbd218176c115fd34ed4e294fdbcae59ffcd6eb33cec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "2b70c1651713d136beaf4e2e421c16c0372d1584478cf096bb0fc188a2799b51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "2f24dca7f26452f3d97671ab5b05947ecbd23fd19d933c6642e519f857a1f433"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "cb028842edd317decb791c8a5f102ca64baa9b1b2d38ecff4933986cf66411f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "f8c21982a61c563a33be60b2159ec6c765b774fedc9135e905384f0edc039183"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "15162b80b56e090a4e98dfa10f2a33729140dd146d0429bd4464c2eceec1e070"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5d794bab706ee15feaaa6632b9984c7986e314a5252ed7dd7fa4b97b613efcb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e26abdc484dcf1ef291b807fe7dc7e408246b274159ad6bb54c62f7596b03a41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "4a64e862b01cf49e0130aed6937a203901539465c098706f65926aacbac2da61"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "3377efdc8d84d391ae86e0995836de96ea320b0ac1cd06c5f8d974264befeca8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "bc3a57f4977d071eac2c1e424dde6e11c8f926736f45379dfe81356938d6ec5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "97535052ee3c5042229ddd09e8bf202d3ca1d521d53a0a84636be15f6bb5dfbd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "aea8477464b717c885d019e4dea3b2edf7955b77ad9379754a06810ee47f1695"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "8e35cc2bde40e841cff582d0281e0a12b669be59c673b443e9d661efc3a5fd52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "947d29729d4e727115cd852c11148901be1147bae5e2a66cb1d8de681612b8d7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "6ca45dbbd886a44bdb8cce8cd6bfa7a86c0fcbc1d51bdff9f7b4d4d056dce272"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "15f36723ce51d4edf70b9d50f2cad751b73cec6238f18767ee7929c110f872fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "8642a63cd261052b857b736ece72e10cd9785be9461d74297473f92428438691"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "8da9599443df2221c208f374de99b1e0dc9baf77f9d3e840ff4cbadc9d9c759a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8174f7cb46c6880e7c0f13f0295c1e2e016a02d7989dd35968b3b60a01fd2954"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8fb70620c4bd9e13f352324c8dab398d80a5843267151b23e8c2f75ca05ffc85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c99fc084db64c421f5563f0d8d64bb946df59bf6ba9c519003f319b9e3971596"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b5c483edd75aaac50b0ffb5db923d548c11dfe87bba5ac86000dc2047518431d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "7a5a2aec49e7a762ef3477f587dc9e61c6248b82a41a7335c1218b11807559f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "b912abc0fd8d846b2743ab098a83134a07b570a69bc29bd0cfed779fc622cb38"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8875dc825f26aeac5a804b3d904c6701dd95c1b71a33b2083ca17e4c6641c1c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d3f3c0250593111f7d2b043a22328016116f8cb46f6db94f443399dcade3af95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d6a9d5c84d3e967bbc28cf22ee64b2e26c437b2424f34f35bad18295b18ddf70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f306b22e543b7f86c0e6ae8567c3514a99bf2ad142f6dcb03a1cd0cf60010cda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "edd6a2585b75605f343679ff30f6594bb610468dfd693091f73e99f04ba5c19a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "9b3688cdaa4b285c15eda8c1c6cb57f697062acc9472f98f41bcf7da37cb03c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "8a7ea723b7a471f3f672aa31774bfbc3db865c05885066926b9893323dc6d8ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "fb65048956540e6c97a8d69e3a5321d1db0cd56dff37acc4de5ee42d9a48e8eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "3d9a36b27931e2312ccbd720da6e3b69bac025de0d15021e10316f9b64b439d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ef5ebd80c95fa6156e234302ba33001545e5548fd4a703b5b8439a6830198885"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "b2f2f491d54bb1589f2640f23f2d4474c533fff025a693911d486d8eb893760a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "3c9eca803b8d966d3aa45c60eba0441c168d454435293fb64acb6b4330cff044"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "43910c112895eee4e99ed7f83284e1b1e103700ea8924cfa505ecbb5aaab6109"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "a1831284ecd5b68aabf3ccc7b5a80d3e72c5e26a96cdedb4007322f4b72f4586"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "36d5703bbf87311abcb6e401117f88e6df553273fae381b0f86359f45a7cb0b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "c92c7c0abd3cc40af9633cdb807e769c58f4515c1149db5d06ebb1abb6df1fe4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d0a65856e8240bae2fc09e53397965cdc11a0f6294e7d48f25dbe5a93bc151cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "3b928f4f2c1728be4cff69a731bed68d239713738d7f67212f1cefd9cdb8e1b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "0b63b29a77e4701e4ca23a5608b14ad24e86ffff1d92256ea7edeb16b6781d2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "ba60ad81652359df24ec4c2d0db4baee57a8db33d7e24407db3096c7d6c4c3ac"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "3fe207ba4ceb312cce1cfda31d47fc6abe172a9c415bc27a5ff0126b43cae57f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "404cd9f07c6bc77942a3512202acc2c992a1207bafc6292ed1c23f371cce324f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "dd5e5b5c520c78b6f01853d28467c60a846dc4fffe52cad2b9cf0cadcb624af0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "44ea2d82cc54bc8176998c83968dda856f9b2d0e0bdcd17b0b335c3c1becd822"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "7a1885138e6ce12cd81fd62589f1abd73c4b843fe15a237ffabe40fd300077e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "82e03afffb3db96aa27f3a2e095ccc341af6f96650ab6a7bb3da76752c2d7ef8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "98aa37e91a39f47d60793e3a2023e2244b828dc07df3db38a0b1d6fde3bdf5cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "11389c97c98c7224f50b0a64cbd2091ac9899b5ee67d096e9bba9b1e0572cc02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "ee3f1aa9078874c08fc37d33953b61d3fe55d6ec6877942a44e5ed86e8991a20"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "2870c900aaa69d7e0428638af26eaa645cfa4e786c7ebb1864317dacc8fa19fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "ce14bd4997d365699409889eeb139e1a60b58716bfc4d38f343311ea974b8337"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "e9154046da49c4aedc4dcb25056db6da0bf256859852c4f7cc0431ebc3af1e9e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "8d1af3a718bf9c5ae83ffccd5ea2310d15a79bfc4a0ddb43691a35320b1331b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "97e4a0cf355aeac2edd445cdca5868a412fc6c1621399eaeb709c41ecfdddbeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a0d57af70abf01b352bd12af3c4be0512d1c399682dd4c9952226a1c058db38a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "6a60cdaa31caf571d921aee61fefc676e0ac3ea3b9da5aaddc2bb5840703299a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "4b8219df273ef55896dc5c5d1dfaf8b76f5c5c1bbd5fb90cf8bbd9dcd6d375dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "f0b6aae39a26e178c128fdbdd68934f4377b5dd1ccdc9a048cf1ab0217fa2aed"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "e177ce06e65e67f1e18cef7ca04c014da7e8c2de99e0b1e71964bb78ceffb90e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "80df46d2191c9491025c3f517f67346c6db8e53dabfa4f7ea28ab72a18525789"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ffd40e6f845a4c90dfb2e457095fab0f864a17528a4cfad709ca44a8bb8ef879"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "1964d8a0f6cf2b554d801af16afb5e51e06303aa6a6bed29b293d9198d1fca97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7668f10279424aa967c843d95a5124eb3b44a42b84fd42cb8119b5ab20ff0287"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "7fa98eb24e9337fa553799070e1d8eeaef8d8623ba175e8f4b6d586d8d8be5bd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f64b8628d12d085af99dd34ff07236b02e41ee59a23cb54062e1335791aa9125"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ba2ca60bf5de6a778f25d39e93370fc0f95ccf0a123d6fb1aa6f638915ba5938"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "3c0fa5549b741672aa73ec6afd820c8cf07d634e9eb18e761b63a37f707dd4a9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "0a8ed98aad627cda1ecdd7854ae9de5d9211959c7c2f350622c105a4976ae257"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "35cdb0304fa7689fad0738ea77da58521479cb4130c2b5b01385d206a52d21a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "ba4a063c3639b107addfc28f0fdbd34d585779d6fd791b52c71be5e181cb3c39"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "480acfd63f5366caa7d9a9a7c1f413a512ab5b18e717c6443b358f87a74e9092"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "b3813d39aa413d6231b8c5f4d6c480dab3fef004a19ac08a962d237d90b13970"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "f591ca7c1054fcbf6a6e537ea084c92c759da8e43052da0bbf23186b06a502c6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "c3a90d5d18f7b11e31deae4546388b4cb19bb1b339f5295a8af099ab13c88c82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "307bbb4c6da9f8b88e1ddea35de5e872da8de9e5a68b19d371cbb6402d47e1cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "add11ca06b1c225ccfca0f179abf195ac3de82957c2de0409209ab77e3ed6f6a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "2f8bdd698a4d79aaaf32aafd7d882203e8551aaa694bacde74c2ee5b99da9bbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "892e6c4a3db9abc468543f7ae69c913f4c16720f73fc97f341706d842335b0a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "fee81501c7b956464650c6fd9bc5b9d5877a8883a04f105e2fa9acc55c321d0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "3dda0bd236db1c9efd02f99dfc61abedfeb0004553925cdadff2455feea4a44d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "9929d5e13b311b58026e16906c6ea98cb3c37acf90840fb7a917f63534060ac1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "a282c9e8c14f72385ca4b0355467a2b9e92f70aa04b7b91c07887290047c4b06"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "ff07bb6d35b6b3d9bde7ae1e0fe5885f1f594bf14f3ba5c2ecc6203457f6cb83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "41c1f9df84a51b24d8431725f6dade7b534d1f306c0a3012854d32121d9759be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "bb272191ff1c4be5536277f964680667fefd978cc6cc30bcd2ae3d2728af82c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d095e0f859e73c3995b536c109988b0c477294dd6fd754732439efeccfd40aa0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "b48c78a57760cac42ce280861fc47ad6867a3ad1fc00c5e48a57b10b20ee9cc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "5482fa291e169d2eaefb62ce6a08d1fb4e4f8ae9686bd1733512f3e057efc9a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "86d557ca128a071e64af0f534d32758957659a9fb4f424075e0a246f49ac0df0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "2d0571409bbbbf593b26eac1ffb722b4f84a29af20e941cc6e8079a3f18d3aed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115968, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "f2f52aee5b13bdbe73d4733258d24109785bd636ee47e116512ac07d38b590f6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "1805950a8cf3cc7ebd958f67bb9844f1c7b94e7f5e43641ce3de66fab631fc03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "18fe01ba1205b43e35b710cf2eb04de5e81a5b2f490b5c872caec4d998341b5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115984, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "262216593f1bee81c8771321066117f304063d41af0c479fa5ba237fa4ddf29d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 115808, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "33be2c918375e7cfa960202c8a257d41ab54e7c65ca1c2e75e5df9135f6e39c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d7282dd112d5cbf16a5d324b5fd6e2a7622cf92556a41e52f1a19dd5ec955c2c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, false, "4be85eb566f1fe0fe7574228f58ed7f8871f33b0c5006880c7fd1e862facdff4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8748946232d90e2838dc2a8b6a5dfa36e942f4a14be25029616213c1a5d2a59b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ee4db783b94bb8f50434130ffe92348f947b8ed07cc3f07ee6592959e227b091"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 1, 0, 0, 0, false, false, false, false, false, true, "110878d72f63cdd78df45ec33a3e3840e2942753664f566e01725d67a6613e08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115104, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b32fde1da4244f5058ab22b6a117f8300c974bed5074bdaa1e5e9f87000f8cbf"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, false, "97b66a8a6f6f41b4cd00bbe0e1dd8ce10156d3266f0e8640a51f0584510bc799"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "4f8c7b11ca211c881e1d95577e87d0e75333650918de6d63b50dd3971024e46d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 115120, 512, 0, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d02114ff0257c7c3447eab4143ab2c23dfa1e7677e66476b1d0a4340d37b7e8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 114944, 512, 0, 0, 0, 0, 0, 0, false, false, false, false, false, true, "8fc3ee09072fed5cbd748d58c1f9c4336daec10db0033e04bde3fc94af8089b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f84d325ced08aa571636060a233d6d44478d7fbf7ab3f8ec2b975effeae5dd6d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "6317b56c14a27cdcd0fb29d158c51fe990ebeb4483c9420bebc80bfeb9c6336e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2bd6875e20ec07f0ad7802cd383b695138215cb36c1e959a6ae979dbe9361c6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "07f8fb50048a66a25d45b418c0c316397c83a48ddb6e874766319116bfbf2530"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "65ded6b7ee028299706950215dc8b4812fc786a84139439ad3d5ab67837fe42d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8b27923ffdf8dd4b9a50c795fd6cd2d6e8647a3168e6ac655ffaa40885481e64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "52ec1b1b259cb57615902a04bf80e4d3f88d56f1fa699c0b4219d0fdc3b8f43f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "9fe86df5af71de6d033b9cdbae354d2aa8eae13dbf61fe1dbbbd5c6c828fa766"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d6128d11c7480650ec6c6f396d16015173be6449b8b99a61715e3fb73750557e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "c23174632b5a4adf8658eb5b919f2b6ba6ca80a9283bf131d36bbf72d83b45e7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "a7c6b4bbe38357b3778a2214fb6db8fdb01ff0955a903e1ccc9c7596d55335e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "3c6b7418a2713065acbfa29544aabd9ceacd958ad4016ccf39fae271048e1f4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "2c1870a4c8799fc6556820561ab1cea2783df6accf587689b499a4ad7324c40f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "e291c1645c7e2ec34a991eea9d39a8e2cf1c62f2e773467847bc9fd2698ccf5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "73ccf9aca827b4968912ac38fc38f78f51a052730fd1f07d84f0e276fb149181"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "eec0160ef39e019a13778cc9e5f9a5a379767cd961c0ee59b64923a05dcfc9d8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "092dfe4f265da7331b0724928672b454b49b0e40c23d08fda0812b1565f74c66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "77ab3c88f3bea39173f20d33375ad812edfbc85699d8a1d0721d0d1780509b4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "14f7b5d5aeba9a1056d368a9eba010cbb50669fd95c18b2524073ab84d80d358"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "80f051e0a6ce5a08b3df57a0356beefaeb00053129b0643f1a51528f90633ca4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "42f1bb01decca57527a4a2307c22886acd98940d05afb02beda46919684bddd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "777db01b07d5ecbecd0f9fa489b7a4e0428234f28708dab7e77358da70770ac6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "e649fd8485c0a7a1a40da2677bf0a429626c3355cc059ec9781188d52e4564a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "5cafb5399efe52d17b21946e4c0894de5222b810c21d40dbeef08d0d951dbe9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ef888e9d4bc4db871ff2c1571824bcef9972ca8083be8fa4b1bd0602513af070"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "a5126e6146a885dd030e9f31c4cbff06c8e2f7f6383a17558acf7e9e695bec63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9bc6f7c7acbf62c26a67470f97280a368c2f5020a5911d12171a17ba9dcd724d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "1a8a6d5a09f94d972da2913075529b3d1ed60cc9f7ffdddf1d117656e47d1758"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "2bad2fd7e4bb1a6e6dfebe3fc3f05be19a56d6675fc722e976e58be17cffe81d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "7b2a021e7d63570313d9f45d7ee723793fae289e7641a6227f80f01f822b46d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e2fc34996aca231d0c56443d267c9f9dd14a78160162dd85f8891ac50e6a227b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "bc0a85c7df525db2f25d1589b9ee6996d42f2065ae57fe4145acc37cfd6d8651"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "1ced5f0bb57a930c638f12fca5defdbb30161cc61ed97a918228ee875a888c4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "1bfc7d44c873b5486083fd203046e00078d1ebd34b92833f4dae051bef3f54d1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "cf8e34e86e006f1655a4032a289a9d122efae776991650610504340d1be8db5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "94fe4f7168e2673aa969923a901240594e63587a7281e3cc2c405a911d6675f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "850a273070d8ce128089294a42f44e58f6a0635b1f8924529e6c7fafc91d6f68"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "50c9bedd367db1739a2cb51489b14e53025466da9367fd1bdda069bcf0f8a4af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "d83541d2a7abc1e27405e92255b1a95790141c44fc73ee7ff38a6161b2798cf7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "3bcd23bb152282f50f77d7b5672958a539c6a4e6804219d3065e2e3d7e0e2be1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "6b6d11e95318fd83fce22a145c5b46f2742f780895b7c701b749b7b09541c334"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ecd32a6ff054be967e861a1d21be345f8e10c5ffbf49a5c62cdd4ae1bf8e1065"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "6b2b5d1ae670eb346cd305450494393f5f0b530af97f23000d2aba33340c68e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6b01808c78068c0de2a80e2bafc763d9a6148b7d4a67d704de0fc1f54bd4167d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "586c0add029bafe6bb98e5431701dd0978f0b1dec9e0e8d299cd9fc57ba9b2f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "0a15ad7c30917080ca312f4d7c13f6959cb0d0f4e18dacb1e4171de549f8d0cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "89622898e71a7f1d0259594a88249da9c6b37c3cdeb4c38e911decdfb44f299e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "3e9e64848f1a1ae16c81dcebbf72daec424c3c0ea71155a45f4eaec08d31794e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "7f200b55558d59c551cf68957082c08a92f6c5418a8623a1cfaa3af493200180"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "d715944285cbc6066b1812eaed7f6c7227ae38cdd4eabb7db25f2cd477f6bef6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "4ac2185564c8747a3ae4634e3292fff4d0a6c2ecbb8cb3fefd4771ea627d4d2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "457467f51718f7d4746a826fd28aec89b51e76126dd749c8ec34a99b83dbb036"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "399aef93072540c236f7b5819935fc9849082762c59eb0f6f876a8f771a3a981"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e26404db929e70f764677e28750e050079f1b7b473bfdebddac39753c8348629"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "44b8c497217309f148ef9a4f883ddb0589040da9b6479edc0aa15eb81cc04f43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "fb1d9042179226638a565582ef054d7d6b30e836ab84739c2168adec4e5d58b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "b2bd7168d997da92531b27437e65b5ad08bb216a7d81c8d4c1cbfad0bf0fa419"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "e5f94d74378b6b3da44acf6b4182c9cd0de3c64a0a0b7612375f4aaad27fbf0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "5903373cfc9946259b5b655855ee2b9c50a2a13975a1013909d63a303e300be2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "c91b68c5af215587713506286a3c615f892a8dadef74ddeeb01f9225dc1206cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5b9ce262d4ceb7a6f1e2f92d177308ffc38ea26939901a8b304efcbc5379842d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "d1dacfad123a91d29a4f17060fa014aff2490517d3250390fa79dadfbc8be113"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "0c417615368d6d16558d894475e084e22c7c77e6ae064e8396ea80beee8cedfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "7fe7fa57ea1abb4237cb48e676d4700909a199c5a7b48f64e739f8a243222221"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "617dcd5ebfa62de3130607e1b9ccae12bd92777a602c5d050410a82f4c8f97ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "63009ae25027f5fca1a6ea70bef81487914f93dc8e06090260f202d07b82a57f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "eba081eea30766d1292700681970d32b80d0fa4b26c9a0e58a6ce66543c73322"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "95c1ed0f0dbec3be78ef0996c31243355bf67a752149d59f735abee9f193ea04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "755744ab15a3d19d13e5004a5b7b777401c5842bfc74fb3743bb3ea02ab3c798"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "c53627cc057ffe46efbaeead14302d3826a691f468817f057734d5d6656b6eaa"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "629f305a1298ed68ba933d547f8e1b94a1727d3f8f0392af3da1bbcf1b33be29"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ff443c90f3ed6a5e180cb895c88380f4e7bf56063bf1b956b697f6e450266805"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "6fddfac6cf592e3cfbbfd57989dd80f08e404bfc436f9a88c94d39b5d3904274"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "31edcb763c33eb06aaa181eb7522f117729de612c73a091e1d6ab98bfde8fcd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "5c0f65130651a0cca2b66d3e9527a0834a6897ff01b5b2adc167eda5097654c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "200cf1744c638c8b4ac6ef160cfb76c59a3e8249159626022c1b5a836fe5feac"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "048874ea0d51efe08cbaa380e432e239693c3cfacb81e9c89d9681f37264abc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "7be12cf2590c9dd2001c57c6e1b046240093d0a31b8e42f140fcaba263e0cea8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "e8767d83c6f2226b26e5241e582073d8e83da58c381ed4f006c150934415fd8a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "cc868b042d5788fedc27316b7446b4eabc93a23228024688240d11f099c7aa07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "33973df7b3065f936f2c69e17816aa9f21b880e78ac78a6f610f73f796101b58"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "3e3fda8bc1f066e1b471e7f352c7acb5a00c40de030318e0e7fde7dd750a51b8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "cbe3c10725dbc61d65b913bf46b5ed5982bad39cc08b6e3c67ec2530100ffbd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "9ec4ad5c28643dde5ccae2d46b71d3238b9edb0dd41e70cad7ebaa21f771b55d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "027a8388e45d4dc99cbc897dc411c7b77bb172ea84998f3007b115a99ae9b62a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "95f8e7339b314826d70ecbb46452436c3b29ca9187074bbdad2d61396d05d191"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "8221a9b9af2e8271deddf28833e12187bc9442b57fb1403d629279ba2699e19b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "20f0c91df62161d97993d4047a6186aee28a1cc85877077db82a4f916019edac"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "f4ba5e7c38ec0cf4e383bf3a387a46fec8f7e578c32fbdbfb2e3c250c54ee172"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "36805aeefdcb04d1bb2fbc47a1157faf15abbae1364978da9d6d4974491f203c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "c666c60ed42b858d7538941109565c6ec62c578a29018ca086506cd8d6f9d08f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "f1d57eac16f17ddb93f7b627a5af288e0a096b0341bc07d67cf464736aa54945"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "24f55e611b6879bc68aa8114611fc39dc34fccd6bd5e3631a4e7544ab83ae355"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "707b4c37e0d2553646b9e01207dfe765d9d067f4cdcf951818c3a1f1bd18015f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "34cd1d1435e54983adf091d16f0a278c39b0db489e5c60cd9846e200fddc359b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "cce10ecdf2715bd822a43fe6e77587e4b2af0d163fe96ee2f934d8f88bf8d4b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "921c237a5c883ddaf82a6caac888d185f851a6e7cce3681d2c35e15a0bd4a4da"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "60dae37b4a13056ff5071531578d19f432a059c8dbc69b6aedbab0c5c4918016"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "debd08b4a073045cc92be1d5f9624d007e2895d62ae1186d64a5957545aa41de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "61efbb424eff9375024935121038161b586203d3b8b5eecb5671637e90c2d584"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "3c30b252c50c483c477da123e18d8bf036931f44608735edfb52002884b01f40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "1fe8f6d3767959ce0ede245f2e5a62ae882049107d5d505f091f143392ead443"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "0860178274e550264990e4f457da2cfd09e47c55e30efa84a7379f5277bcb485"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "c38e83502489922cf9ffea04cba0ffc631f3eb077f2c686cdc20740c08369ebd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "fd8fdf62a42bfc5c2e209a2b8174095f66cee670c78399892fd099ad928f15fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "87dc769a3d623ac922e9460d2c0ceb02cb6f8cf45da50c7d4540be4ff7eadae6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "289feac6949fb9986501feb2af0c7688dbcab2cf14c3dc2c182eaad8d46b5559"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "df9bab98d11cbcdd8a26624b70bdec7de40fa9cb3f263e7003387f9fbfbf51fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "faed849873af565c6cfd22cbb2247a5ca00223601d36a2ed186909fa82dbb3a8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5be5fca8dc01d2cf2db7a1cc7065d971b79ff22a180757a8a28dfc311c2d21fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b4c3f8cf0bc97164c4c965edcbd936c5afebd44044f93ec88b426ec8fbeb415b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "4d8e86d234f5792305cc19aeada95c97418cd970ea939579ef28384d15195642"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "b35f8b76a45400eb1b953f67221a57d1fa6b6a1db5414ac019fe5f64958d82a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "16148f5e41269b25adc04743ae0ea55288240cd5740bd1760e17c78121d1abc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "304204b76c2220cdb2ed4924e55e7c16ecd78b7e64b95f2c5ac35df30ba26684"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "c8ade6e7cc361fd5c246ab4e12911ff3e4dbb4758f4957d56e357da0331a572a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "a17fca98782dcd0f27ce9651e9bb32d04e838f5dcdd8da3fcbbc3c148dbab2fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "39dc1d0385dbdbe87539726e53b72ef789acb2dbd7d593416c62298d672a8d25"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "63a0e8832fac208ac2799a812d494942caf7a41ac43257a4668477986670b6ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1106ed022369c3ffad6e923b57f6083ac1fe7c3093a6b7e6c00fe1ff2a92e68d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "7a65a03794051caccbee18832eff98105403e53cc7bf7ba3b1dcdaa81654c52b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "48dfc4a47ab5881a119f2855d2fa171b232e98628b3a30c6d7900b46ca72fc4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "96fb37f429f5b14bcddd7314617150de9cb7803a24a0ac2d223373cda96ee6d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "b43e244271587f3b218b2c4bdf21949259ae9ac09defb8fa2245bb4207eb1ae3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ad1e4da9a3b477b3ca4cdb3dbfd081a4f1ad66e1c62a9cfffa7774145dc11534"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "58240c18a539b16ef1c3c63f957a29757f70e6a6f17294acf2b940c5674ad685"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "8b9f47a85f25cb21f1b52fd253f1d809a34bfa03223c451c5c318c6e2ec50f99"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "332d0eed5725a4b503ae4419098aeed1e150f8f0c0948fa3661f89acc3ae6684"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9194589918b0c5139ede0ec3f404e9cadbccf9b0a7542f308395aad43f51b88c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "adf2b73238d7da9bccb2b542c718b68ca1cec1f306a2ee0f250aa7efe036dd1a"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "f174618947304ad2c094c462806b3242b9942d61e34b4656947d112e7d030e43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "27b4629d2b09322e0ff65d0c3a53728b29122866a288cbd97d84cf6017e4c9de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "f277648305b1fb7b714a3fc381fc8be9d2544be4f750cca6cca7e53f41e9d6b8"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "73bc641f6920a3db298843ac5a346368f4e9652fca8be32e66053d2eaa8d832a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "1eaa9d9718292a9bc92f94ea7443819511cdaf69972a1049cdd24ec9668e4302"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "b8bbfe11b3dd2b44678b9d411c3eb21724664b90d1b2f8ceb18ea098dc8760eb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "1adf10de49ec3a9bc3a174baac097d07d3682023538ad06cb6becd3fdb318dc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "b31bebcffd2b1b4543363addca466f058a661c9296c4b27aa34df3609768c057"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "698b18d0e86a3e92732402fa74f47cf9c43f4444432dd0bb3166aae8a25e9ec0"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "7781f36f651c5408ffa89de39d8f794627e378a79aad87eaa7df1bee4de31bfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "4733e54dcbc4b62584aba84afc604e75d8febe2f9838387b96d04da83f470c6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "b7a46333aaf6e417d1bde59766f6a39fd015519454fe109f01d6a19f38de34be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "b24f264fc269f1cb6254bdb2fe073539171134847c61054cb1888381ad878aaf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "b556d8cbbf6b2a3543203f3e8d78b9c334e3a707cc8f5d45f5aca7fd8d0ffa58"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0308eb88a129669650aef626878254392794b7227a8c5fa58a3126b67820a309"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "4f9ae2507bfa0ec09be994f4151320b010a59f320c2d33a7e691896f8da220c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "d02f93cdfc552f71b1cba5a9b8f70b470b5e795fb33b0c6fc14a2794d2329f72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6085b8985ec5bb1d33a78068728f28f65cf1b0227bbd0d8895c7a9e021496470"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "66d436e0ff7ac906df6ce8c2c84ed1a04f4bb88e82f7ea6c8c595854d0bee12f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "77940f13c89f018977640a5c0936395796e0e072f569e955f4c46876beb258b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "79759342783ad0596d96cc925b810b9fa9a72640b9152506b445e59248f32ae6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e183380c6fe75e6efb82d6b9030f1a2e769559cbe6d8829dc9bf38fb7e6add25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3a103ea4edaec506b16b607eafe2f769b6fdf06cf64a96b5cd7da9665222b9b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "d888228ee902d2d051d05806c2e23af5d374a156478939786914786ba71c2024"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "88eb7d0229507de84ec368abd88c45b10877107b325faadb3efbd1536b379f3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "9f375d9878b60a1a163beb541ec97ac4c84aee64547da67a1495e84de3f39c40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "d689402477f54b1b839e7a409bf1df280268ba179270872ca6fa0bffdcc1961d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "a20afdf26e795f87ea346abacafd872c6fe6fc2fb683e8b1dce44d95c9c41e30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e94182cf59361c482d72fe07e713d0e4cb0bb7dc81c77b2cf4933545626667e7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "3db2fca1dafae8894cb6fc2d4be6b525d19043485e602300b72a80ff3099c6d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "1406a09f5cddb1dbd8355812ce4875100d235f44a2a0a95e8f11e7a6bca86af3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9ae68ac9deea531ef205c169cf4e923515bfdd6b397523d5668f3ff3fd02f93d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "aba0d3e9142cb6b7f5f6b4a58b5ba505316ea6d62aa4026c185c1fa081f5d4d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "dd98264e97215a1884a45ce6fe5ef8b2ad0f0dd5b7068b8359e5b17dace388dd"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "1ea541c94826beb40789216d3b869afa13b972a158370f6f8211b42343c4c1c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d2d2ac9f1b18262239f5e8d73548a3cc8a27f31404804e3414d8f4be86fae243"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "92c1446ad9a640adcf436f72c7cb6ec0322932abf9b30529c042b5ec95fc6a2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "00414a591a8ea5b9e2328dd475565e9546318e0486051b3b651c9abe08a92fc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "648784d13429741020225752bf383fd5dc1e9e04efc010f8f287a21376508298"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "d82da5d953958da40b6826cb3d5f524c90d7dd4be24230b089abea33f2afc2c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "5e994221e63aa9658b0814c799b938770ae8006de125b335c4703f34de3868be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "c5bf62ae7d1562a7abc2dd6964a7b6771302401f96cd01951dc143dede0fca32"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "5937ac8fc77108b54022cc3e9e8ef0c3e2b60c9387e63ce8778fd4c4e95b6290"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "f873c02c2f174ccb3c103e01a42aea6e9e880ff77db744358966f6a2b1e9f9e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "c5fa72134d101905fea83e1b79475e72bdbb4078135634e48b18b5a222612638"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "8569c0a7c869f8ddef21e1804f906e7d528355884a4bf8398d48cee372ef41e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "322ddc5ea3c637218a655692d2433dc4cc5c8895c45a97753379f8ba961c2646"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a5250183eb34151629e06ba64ae02e4be6d41a5445e76c1b688e57888aa014b6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "a300a03431937b1732ff52d79f9fb71dc2202fe9ca69c2d629d408d7a5be5a3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "fdae2ae9af8db33a3155c4c879ea3b9dc77db1c086e138ff13d810b40b2daaa0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "6d8fe6f9429b16b002375cb5ecf24ae12aaa1ba0a33e71eb95708af0d325a0b2"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "6a1b660759bd3e3a24138f8d74c2a2b16ce1ad9d12c7fec8b6042152dd43ecf3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "b6ebd5efba0b6dbcf9c29f24f8e9a7cc9d7a462ec51c447979b847b6e2d1bc9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "193fd81dd124cf306aa8c95ed7ecf4fbf3097bc186400b84f3a044e931faea79"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "3d081936898cb9414997b4d482453c4a00e733dab554d30fac335e2a34ae1852"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "88eed79da4021fe8fa1f720ebc17732375c4350e704530ba3dc32e51504aa7be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "aef6001172bdb4193d61d9bb768a4148eec39ddb1c6e1b064346505314ddaddb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "107695b9c35331b216719bca74f5de70508196203bddae30a459c5ee80041661"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "99a9331b46f1537d3eca70cb8763aeec00c09946e930396b5a9dd6cbf1d9cfb7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "92dd146a4cd08ec865fbb3a13d2b0b7f08c5298ebb6aefec41603486bcff98ba"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "97d306e3fffd82724c0296d53197bdab840c4d2c039f9c7509e3b59702c81913"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "2aa2b83a25ff356e9c50f9b8bc9693a19b66f0aa3ca4c3f7d870ea8585557401"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "e7952fe561a20b58fedd381dcdb596936012f862cb50fb510cbd3cb800cd40f9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "04038177522fa81a3edb232f5acaf1893b09adc9b7f62edb1f1e2e0d7581ee94"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "5fe7d5e1db5bd859967c89a739b0d1e5da159c10e2567b6ec52ea3106f8c8f65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "a95a9e0e98a3c5e331d7a36f39f81e640b37e9a0d4ccffafbecf72a22bfb1885"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "46d57a97f3c45ebb18905642c6cdf574e919a6c34d1e4768dc015127ef041f80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "ffe2268f2cfaac428709dce77f435d4f104f98895f378d59d73ae3d626c5b9b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "0e6ec61828d91c120da9ae507d087d5b9bf77d5d5da62ee9000c1230994f36e7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "35759e882ef37a4030e6382129cf71968b4bbaf3bc5881783a8b5bbf65d86358"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "c8ec63755cf8ab9cc2fdd284d49a2bda99945ba23851197067fccad7eea611d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "e726ca3f061dd04ebd52c3463aa44d3e7cec7cce92bed08e952ba754694c17c7"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "ea294dc5c3a9c87a40d2b997fafa79e695d6c207c737ceb93baa2e9e91387883"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f682e09675bb8d4a5fc9d0bdf432629a1a774438c21097342b0b9177f5bb06b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "de9878c5214943c0daa1f5a51ae225ed6b2b2dab947c1c9a4ecb6506a4e82552"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "5b6b4ee8f7b6813f2701e2a121e0a537b676bd8bb24fe4d1b4ab5a9971d69034"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "fd8f61169b370a57fbd5beb0b04fa741bbb6fdbcf14314cbc141a2dcf8307d9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "47aedfd269002d50a85a19cb119777e43d4b883563b8950d3ce3d66deab795e1"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "555654df291e5da37e8de5b2a8537e83e0c45a352aa94faeed758e56a34b4ca8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "673ac3a70ce4f72a1913ba7e46348dea66bb43698a26a9c4c3de9add73b6ce76"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ff0da5afeaf623b20f36b6e841f43cb0471051ef2be6377486efa123949ff681"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "345fb2b796b431e93c46dca7742fec35cbe4fee1f6b04101f33e1490bf4a83a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "8ea8c83ace1228167d38dc9f6abe422ab390577cffab752cf6ebfa39b3d23c94"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "5d8ee56e403684894759a4206f2aa6781609c82b25c6cbae76c85ddc1e02b1c6"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "fb6b5fffa0ad340e5d32bf4af5e18130758a46e0ab3832dc2c004d7b73458e37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "ed09e9792697f70f890167416a1eb3c1c42c3801727480bf5791ca460abcf034"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "5dfcc531ee0bd38fe1235889ae11d60ef3133bdc8e9e9314c02a6066c224b86b"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "00559bdcd6049d8ea20ff1a4618ffda94c0692c1549113d39a2b03cbae6ff1fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "97ca5f4a8be367b0dd494c17da60cd36d4dffc6b573e7aeadae48de46d2c32b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "f83733fed2dc53dd935dd049da540117b9f16f986ba3c5a530e668baf83af6ba"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "613fb12eb4de55a997c4dbdefd3f3bb38babdd25c3e59496e6aeeeda6bed34bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8ae55cabb47eddc475829487ac37beec7eebb904d152e2d05eb2239b1052e1bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "7625eb65847ad4fcc319b35f38f51a29bb1d43f855f637b99bd6644b7d3e1dce"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "fbc082736fb600a8eddee6b5e99cd890c8d07bab35ba62b52d2144c3317250e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "fa219d06d97542abec104548b9f1fb474db6f491da893b4a1aaff3112e9bb90f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "a22ba55295d32463740b266148f85e1d2abe846377418b824a91ac7725d4c4db"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "84add08c279c040f2d5813af71d3088aa54d9fb2910f608015c0d00b45b5304b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a036aeb57d50c26d0d34f09c2a861904b7d70a6d27076421bb2f333d2038d296"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "fd09854c9aaeab34e563737c9f738a0e8d3ba51a2719df5a83ae6344f67d64db"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "eb8324f0220720d6d965c5591b5ce9a48bf81a0a4390e7afbae45c1c04422f2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ff2794c28fe9e91ff4abe3cbdc4bbdf7903f23bd60bf0928aca5197fb4b6d666"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "ad0f304a1dc6730fc30a6074a271a42d893e2e339d66ce9ab7e7f1761661ba64"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "aa2ef8cf12aad87bcc6144583991300b2ee1f310b1e2c2b119e6af8490c5448c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "018c782168f47e78c29246c3c2644f5cf57accb150626367e7e3cc801ef9e5c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "eb3aeb87731a2a8039334689a23d019ac522b99d6cad32ea935795df6de5e9a9"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "b50f73a2110db73b0a62560302976a5c643e0638e845fd294ba6afd1943b492b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "e7c842d183818aab83505eab411efce780c24837c07c573f32223a714d5fad50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "a39d55115027f589f9f437ffa57cba9a9ffe5616ac95580b6ec1e6b8b6c84a1c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "2239e80929f68fb20d6cdd110247d6a832944933d49687deef5de377892db32c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "fb943b1e84b24ae14fff1eac28f9fbc6783cf673d7fc4a53904c2566a2d84e10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "783e68654c2e20efe3f639af3f04f3f1af32c6a1f39b26b841330fe79a3edf85"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "a47da1ced8314930807bb7ea7a55a2938799caf824db173a55876ef13f8eebbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "9af7906906802d8a2cf364144004d3c75b71e2466c01b74a1eca5ad43be16b4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "65b520d551e8ef8db832bd2bbbe7037f414065f14edd7a6c71f32ad3fb47975a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "8e925d25b3f1bdcb6d69d89e7f4b2da776ac89a1792af518cecbc1228ae9977e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "953728d5e9a6fdd529b8e61852e74203c920faea4a11cdab7a32142364190107"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "f0bbe4030ec9eba095eaf8236c5df5d84bfead34a4bb479121799bc9b5ba5876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c540adb96662c92f395cfbecc9aa5755709bd70c19877a5bb2ad1040ee2fa906"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "a1220a88f6a718c8983224c731b57a3c2203d8e420e52c8539103fcde61453fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "4cdc202d2e1aac5eb6da344a89d5df2195498b1cab2fcf7dabd7ee38d668fae5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213488, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "d9cf954ffc1a402f1a00f8d70368183f02e3226262a5d8e60a9d363445b7863c"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "543bd1009277ddabe3d530c0b354fcb56cd31bdc1736278d7cd739092451aa77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "a92b58b26b24d2481a98a892d429ea21e9c925eb3d570d8d91b05cde8d827247"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 213504, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "ea25fc7e832659cb5b82b4010aa9c409e71054f9b5d1bf04d661294c61e79b67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 213328, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "1cd6b83773f2fe4b33fd257a28d9846496370528303ed2b80392b1028d6bc8a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "430854db3b4c14cd03167a5ece71064f265099e674bc7cbf0dd1c18f1fc6c6ac"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "49a3f31ecaf1cd6556cb6b5c659d790b8779adcd9e742178337a9679836b9248"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "9ce602eef4fd03a8bc6fdfb50f0476c61dd95d8ff45f14f1c75de4d1c3f42f37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c485634c7b175dfedf565d4064633e2586923cb14f7e2bdda11e4644cc87440b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "cd14c74ff65e5090aadf48428569508ff6e5d973957b33a65dfcd5b9f2916b3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "ab4728da53641223e8c191d5f524a51baebcb090497beebd0f3575a0fffbecfc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "9f2c30237c6ec81888c6f03d67f0d24bd707f470806a23105b0c0129833712bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "92dd27373b762c16a2a08040f6d61be52a1095700d05d7e6f5fb382c1791b057"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "adc101060c4dc9e9493c63c564c7b9832efb207c72772235dd707a1847e297a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "5344efe512bfa4a3b2c429120c60d0d3a5f984cbcda365954f82147ee5385b3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214352, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "9dd7c0f67dad5fa174d6d49babc17741d48f108b9afd7c42b143267fb5594ebc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "67b14cfbcdbe231e78e0101306f62d69a4fec6738bb85b91ab754a0803a6ea04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "955bc494b34e630e920b1e13e91f85983b3408eed9f41fe1ac9189e6a92e5c38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 214368, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "10c7c2c7121cf2564e06ef435c67ad50dbd9c70dbd0b4cc9752415b444cc41e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 214192, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "f646b80f6d9a5be96de929b3c64ba73d6a9e60b4193417e97742d6a44d0e44f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "3defa16e4f8b135b91608576e94b2d25dbc72839aaf91e50785dc0f2149c1537"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "1469ed5d91cdf5e5bb7935851415181010b6936af1c8bece6c3b9149b79f8240"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "bf120d99943edf1516f9a42754a19b581afea98c801be16543e024ee411ebb96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "6b28a94cdbf06e74a3121bd338af48adb4d9c84a0ef26e668c40907606ea2acb"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "0e08707a536a63cf739e2e509e32599e224836fd579b33ed7edef3c222d5f2d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "ef5a22e4a4504845a24d591c298c93e9983882f45cd0b6547e76b4051dd0d73b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "fa1b5743996ee9d43f0bba89daa61aef41ac16ee8f224679b2e2e1b47c9ea42f"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "3203c112e43359c2a08073c2de9b6ec42645d32807308a7c02687d20ad66aa99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "ae7efc9b072d84772a530ecf640b37f4880739a026c2040e1e92ec2723b5b0bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "b9bcbd850a68927fca788069d6d5ac77b1a7d2785e19f1f12bada8a1c330dc74"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "6b7c105fc292bf3bdc65469c01bcb26cc2898fc80f8a983a9ef6e7d199854633"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "d64903aa696ef8c2369083dd63f6dbe5a3b54b1e4fd835e6ec97c05bb3b7061f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41376, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "009d099550aa635fd162ff6cc3666c5d0c019824b17635633c4fc2cb46e0433d"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "89be03603e424cc407e405ad4ab3210f63d99507e11dfb8a005103a1f7f6c023"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "1919981fb6502d6f28960cc58c9a83c9139ececacecd41533e89aca0e58ef014"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41392, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "db927607e00fb01cf9afe6378b0a9a7a55fd068fe18630fa5c0a29f5936bb4b3"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "f3750226da7b19a58c7fa11c65d4a096afe9486b64a411cffdd535925a665eca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1936e1eaa611233252c14d3f9821594b6fadfccd77b3960b60d434d70367a6bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "e7e254c7a70fefafe49ddb7cfdc9c18d1e8f9e551545e75f76b2e24584237181"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "467745c2913c68c2fb1993cbc6542d4a1c8791b0f74e2cace0f7c98071df71fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "5c26975ed40235ece32b9cb2647ba53869ab0c6af73885f51832c3a191617aef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "204388d08c682e240491bc04e4ab89adfaeb7ea0e8e340eb62ebc25ce776ec0e"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "ff4a0dd8fc437450287e51d2300042cd1d0aaf2e9191b0e1abf2650b710082d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "4faa8a03ad6703a5df7e4e2411895e51d5024581bdd7c5eb3104fa7105356499"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "451ae83d772a3cb0438422bbe750e21a17d3646f98e92fc0a4960fabf67d89f4"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "2b20da3ed3fede2a2b60eda0e902a11d020dc2909de60604d16d80fbd5c5845f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "01a09c067f6bf5e9431504b24e78dfbada76dd5b960a336784740a8fe65920df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "8ae14f6fd9ea37425fef24be276eee44b4f45b09bd9472143c6ebb16b338abfc"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "d11e48d3a834c2bc53a736f719dfdf4a700533a46a92e286e3a00f0d1f640047"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "bb11255b7d348932540742b378a91a97ec9e61b18aa713a6517cd94d91f679f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42240, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "8c03f046f66b4166a00ab1f6f68491618ff232a4c29761fd375880612cb04b59"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6f421cd81052929a7b0feb8dbed2d8798ecbd4c61d761623775cd507d5583125"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "76261d38b47ae45dc8eb344c840202af6558a03bddf54ba0b222a381dee737ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42256, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "a64df9ce52630d8e5718c8c9fa77375939ab720f87cefb95a3df48c1c0167528"}, { DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "285a0f9fe490067c77884bb6df96b0a124e3abdcad7e67374a41b9a87055b9b6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "2d8e7597423d601ca50d6134245707ca97e3ccc3ae4526f6e69540f4bf61d864"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "791d857f9b21a03fbe07d9cda67cd4a0da1c13d5741298a4654bf4502fb53968"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "990cc384496258b74414f4f764889e2a25deb617ac29a05d7fcdd6b68a1ba1a4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "507adbc46a3e0bf14e58b880b1125441ce8b4186d96f02ee7dfa0be66f5305a2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "7b3d5fb1b3a2a8b1ab0579b04eeff5e8fab568501d7d1df5efd88671b60d2484"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "933e857cde33efed69a9314a231d430190bc4f57606618ee1014168cefaa2339"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "337d866ceba075b1c05ab8e1ad970d2b609a074ff72a290525e3b1bb30ef7f61"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "dc137055006f5f1470b8a4b7507657a69e49d45b42836a4d20da7043772c97f0"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "5505f7e2c4fb5976d8aee209834ac0da0b63557ea7058c12347b09cd99efcb8e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "070e77552c702ff4f901cc592929df3a477a4be641b77790b5da2fc3be09fb20"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "9a0bee33af09eecf91bac53de5d2ce183f8cd3793bb092486b2c9a7de9e26fe0"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "eeb623d07435e14a2a3f5f8ec15302ab4e6b9607b978dcb35eca7fc9445e5319"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "2ba46e1a5b2b59745ee8960701c31319855e83833ab26d691a68031019903501"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164288, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "0c2060c6ce93ce430765fcd72bd122a0159ae20375333082fcac4f648885d2cb"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "ebcb11ef821475f1094da11fabab221aa855e3d707fe174f53508a072eaef105"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "bf2429bb48671ab09e229130d31a2a01b00fed36d9326ff7b401a2b1e2807eb8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164304, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "8a8b28f4600bfbe166f5dfb865e2b71d1732aaa45f98805587fe14d010f33bcd"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "799cbf370edeb637d8bf6ff4c388a11be6da38754a6594913189fe0f20a93350"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "1340027469b34b7938a9f4b061b62a613dcc8eaee15a65669ae9c1893a4bf185"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "8bfcbdd4b75ffccbea85f56890d65f9432fbf48e847d650ea668e4c9fada5473"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "8d4e381686a0e5091e03df186efa91be0a2734c1dd47429971209c9e47277233"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "02b6289a2d822b9e07bc55fbbc49ed8176b343c143c9e12a5b57938833864a4a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "c1ffd4fb0b1185a726d4997bdff74d3b39506944c0b3282952da16228a1533de"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "601e77ffade16ad3e07daa2f7d12ef9d044d5c345bb05c8aed3a5e78e4970af6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "df437316b6841b9fb4495e2b9909dac4a872163826de038958c5f770cab7de4e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "24a8b291149dea4a40b9a60ff678a797ce6f37f5dedbdd9174c7faaf0484a87c"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "275a81e459e23389ea3a79daf7a18911f0bf782059b03f1b49b2684bebfe6025"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "7b92c7daa4560fb4a78ec948e018de7c0ee104dc48a16978ee4de89fd946a3f3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "9e8fed406889b781922d7450717f726aa4c24b720f3dcd63eabd164ca6b2cd17"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "11fa072ced2213eab88f5e3d4e17623d7f1f5543cac0611ffb0d75b30850c589"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "779489d42185c10d513d33babc17bd54f642db8c9233c43e87a58232bdf55150"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165152, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "a5f4571acfa39e734977d6e9635e4473d5eb50672240199d0be1602d96bba409"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "6aa7b6c29d9e44e73cb9ebded650a23674d9a8a2b0403a1a0656684db36f6f49"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "b06030cc08d7f1abce378c49923a3521a91cbd3b58d5b15a05480690ddff1c08"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165168, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "d4991c9d38bc91a92257ff9b4d4704f0156cd3a1495ba887b3154166c3a34d6e"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "e12eee78a0e4395874fab0d31511dfbef73b155832cd02b5658e7acb1dd0a894"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "168584fab8d49e72adb64ecbd7d9de45f5f8056c88009896e6ca0594acda921c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "f059ad3ef1cd1aab9bb224b138555b42c20c32efb165eb2953d3fb360a301f0a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "66e98c27614610ea0ab6f6332e77d9f6d8f42a48611c99f0c3e5469e89e60699"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "30de055d5ce17a38af5d8c67722ed753c7dccec313ffc2d70496604e0fe2bc21"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "a045d8dfc2b31778e7bc5e076ae820cacb866a17076b5a3676a4801d12758271"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "d6ffe437e8af1577a6f7f66ce5e15074b17934c78e79c0d8bb36dd896f2a4959"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "8afa8252a9932efb447add23ee98450a9122fb4edc5ff625dcd9afd25f19279f"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "bda33c50c70a6ffa91eec5a342173ff27ebb82d6ea828b14e24206e5beb13013"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "115fc569a4b81af9159c4a9be8b6b27434f8d59ccb40584b04b753c43523414c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "04d24bb5e1458f09d1fe370a47f766ca738d863e2f61197c5055eb812fde32a8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "dff3d38e3a649a8e2fba7b9e445412f9de6c494341caf15454043022200a6bc0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 197008, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "4eb087738f3c75e4f4d16935ddfdc920a7f059a14ca568e86f66e523648ea9d3"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "96357cdd75a24329909dc91507d7dd7055c6df74ff7dd10f2a942204fdadeef2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "c05ba73382d6abcfcf1a4d403c881932278ba212babf65c819d341021c536704"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197024, 384, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "cbde9ac42b47773bccf789a092c634a67b0d18e462ef158cd099a104b89b1427"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 196848, 384, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "28fa5dff384c8c613333669d9cebee1f60aef3ac9df5f0423aa3d5fe0e33efd7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "633ddd2218f3bc98e814a1194d0d36917dd5c5cb1913334c4ca5935e30404536"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "52c2db1b14dfa27669351185de199d42541bc5e58c2302be36085f29e338a01a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "4e992469577615ae2d7025d356e7b8621fb3440dbccead2fe2d2a52f7b926a5d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "ef798fa76cae8c51ecf13f475b6a1ff225da588c8326081074e006a9b9dac8bd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "fe654a67b6dea1d0630b382a73ef770d1cbe7cdd89e4876668ec4dbb3b64365b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "a1c6c2245d503c449b9e34afea4f2dc8e00ce97c8a9af3f2235c1d792ccc9baf"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "bab071e2fa6f4e57bf53ea07b31f59bdd2841f3937399001bd5801c982101069"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "6978a2e5f498f080dd591f63ac780bd4d7dd2d2204d1f448169433571c8a1107"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "f30b300420a5b45d434611282c728e890518eef5284114dd4208ca0abee28623"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "6f71d3d3e58aa5221ffef0124ae144742c6e50e5a3a08a5de642cf1b1af309be"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197872, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "814b7689ff58960b82e982cda43fab9cfd1392bc956d51abde47337db4fba884"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "5917fffafd9b2fc0bc3d311e41cd38525c9c425f574c5c7faca5598c4b82b00f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "d7eefed97583117b26485db4bd131c43bf71d8912a57bf6e866c93f58d838ea0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 197888, 384, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "94e1d4cd09ebc56772188bd42858f67dd03461e639e962cc02c258afa8cb2988"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 197712, 384, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "ed0cc593c6b1f8dcf72a8bdbd1193a9ccd23d4166b56da9359f9e8579cf8abeb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, false, "bac61cd3aaaac0b6a1d03fc41a4c7066b82fe599126e24b3798440dee54fa113"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, false, "1c574c7981f60b3cab2611badc18350213e27c486f3445793f7e09fa3507f36b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "2a8ef83bb3be481a93e0dab8a9314b2b8ae187b0787c25a7e23d30ab90a2dc61"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 1, 0, 1, 0, false, false, false, false, false, true, "ee342ff0127a831d9526d6689566e875104510b161125a2c4f17469ffcd4fa7b"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 0, 0, false, false, false, false, false, true, "f39123f7ecccc55c68c9ce20808d393139ceae99a76ad42bd7345f4c3fe58981"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "aa54d18274d665fb627c46552e778e16f58355c5e22652925168e83bf1fe980c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, false, "4e678e6afd1eb47b47d0dbfe6c8ccbfb3fc39f946af7c0b6e5f50d0bebf667b1"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, false, "0aa10546a04b5bfded99951e76dcd856320cd43bfef74eccf15d837c436a8f22"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "bb87eade6bdb55e8b865db5f79e4e1ffb98ee41c73a660572bf6b5ef6a507600"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 0, 0, 1, 0, false, false, false, false, false, true, "e02c56ef41c4255b64f7953502b1092bc5777568cf05c635e19023eb4eb4868a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 0, 0, false, false, false, false, false, true, "c7fb92c04b8aba7b6b7610b0584f1a8f9af3cc8cb0e9c80c64ece9baa7ced241"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "a834b385c3848b17001896c68b1f2e682afae34db89bfe28dee7068b45bdea0a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82336, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, false, "c3927af45d1cd87171e7da565de818dd8b1ed29d203bfe4abad56c208decc86d"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, false, "00583a18c484b658b6af34a645f4fbd08951d6bb92f84c93bbcdcba0454f6d8b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "26f5125c556f626fd33be20efa3959e998473b48101fa63eb7745c56f2b1e57e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82352, 512, 1, 0, 2, 0, 1, 0, false, false, false, false, false, true, "e1cd720642ab1e84d356769f5bcea6a443d8ef1b8388606496e5b25488ea3d67"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 0, 0, false, false, false, false, false, true, "d1fb938661a42b731fcbe40b7ccda0bde09930a2d739d8864ca077b4e38e4a79"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "03816ea3fe8c93dc9598d2690f43bab9028b7ff0a8c542b7ff6efdff1cf1562e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, false, "6d22bf86256a1e88cf9dfc10cffed0dd341ccb8766f3d46c7d67c71da834e528"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, false, "5be4e7566e57d03ae2aa35139d799cb1e75250854401a99f458e05725825339f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "5a48ba6c12bae5d201eb20ea717f2319ea1b175716619b4b6f6c7624594c9cac"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 1, 0, 1, 0, false, false, false, false, false, true, "54a62be46cbf5f5edd8a4e0a0ad974b4676f0fff40de4d8ad9c15af50d0c625f"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 0, 0, false, false, false, false, false, true, "c81caae44e25100847136cb2a1b758c5475a12a82e5061be4c59eb6a2971cd2e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "87911058581eee47707fd791f430717a6d23694d367501ed3e6747e79269927e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, false, "5cd7a07bc5868d11c2758de3bddd52f6eb30566f66be5b557d0146f6ecbb99ba"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, false, "8b954252f30f2f49a587c33a2953b0903abe5e84dc0e9e7d339cb61711e7681c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "4009e2cd14c33d2e06839c02db1508dad2c4ee1e83514f12e4e542a853575317"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 0, 0, 1, 0, false, false, false, false, false, true, "22da67063e1f0167fd4bded850095dc0c830d6eeca6e7d31cdb25c9149751b6d"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 0, 0, false, false, false, false, false, true, "7d13c14994cd6d7829662f9e198a135374676d5229754d0b5ef714c867a1a788"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "e6e803c61389b514215161bfc1738c4f51a20a0183cc860b66efa42dcc42151e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83200, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, false, "5e3800367f6fb8425d0cac1b37d47667348904e7c1d334244301f27c0056bbdd"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, false, "64c046fe46126a7a81feb8ee815cf4f90065b3e9d6058fbab3a957d99f1175e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "1da43cb26389beff06ef56dbd9a0dad0dad51743ff60ff3c6de7961ebf85462c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83216, 512, 2, 32, 2, 0, 1, 0, false, false, false, false, false, true, "fe3ab3ae04e2e57ae01219284673d047076df2f1b8243fc10ba1a108754aa90a"}, { DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 0, 0, false, false, false, false, false, true, "b08c0484f1368367a733738e4d6a56d9b0c3c9b4fa73961326ce218bd9dfc4ea"}, #endif // EXCLUDE_SM_100 }; diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py index d23b48cdc0..6f601455f4 100644 --- a/tensorrt_llm/_torch/modules/linear.py +++ b/tensorrt_llm/_torch/modules/linear.py @@ -752,7 +752,7 @@ class FP8QDQLinearMethod(UnquantizedLinearMethod): self.rescale_fused_weights(module) # Handle kv_scales for NVFP4 KV cache - if os.environ.get("TRTLLM_LOAD_KV_SCALES", "0") == "1": + if os.environ.get("TRTLLM_LOAD_KV_SCALES", "1") == "1": k_scales = getattr(module, "tmp_k_scales", []) v_scales = getattr(module, "tmp_v_scales", []) if k_scales: @@ -1364,8 +1364,7 @@ class NVFP4LinearMethod(LinearMethodBase): # Load k and v scales, used for NVFP4 KV cache k_scale, v_scale = self.load_kv_scales(weights) - # NOTE: Currently the calibrated kv scales may cause overflow for certain input, disabling by default. - if os.environ.get("TRTLLM_LOAD_KV_SCALES", "0") == "1": + if os.environ.get("TRTLLM_LOAD_KV_SCALES", "1") == "1": if len(k_scale) != 0: assert len(v_scale) != 0 # The calibrated KV scales are amax / (6 * 448), but the requested KV scales are amax / 448, diff --git a/tests/integration/defs/accuracy/references/longbench_v1.yaml b/tests/integration/defs/accuracy/references/longbench_v1.yaml index e54288d094..a3b0b84d21 100644 --- a/tests/integration/defs/accuracy/references/longbench_v1.yaml +++ b/tests/integration/defs/accuracy/references/longbench_v1.yaml @@ -6,3 +6,7 @@ Qwen3/Qwen3-30B-A3B-Instruct-2507: accuracy: 47.102 - extra_acc_spec: "target_sparsity=0.9" accuracy: 46.169 +deepseek-ai/DeepSeek-V3-0324: + - quant_algo: NVFP4 + extra_acc_spec: "target_sparsity=0.9" + accuracy: 44.94 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 7a6db59559..f6d0f42304 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2659,6 +2659,50 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness): task.evaluate(llm) +@pytest.mark.timeout(14400) +@pytest.mark.skip_less_device(8) +class TestDeepSeekV3(LlmapiAccuracyTestHarness): + MODEL_NAME = "deepseek-ai/DeepSeek-V3-0324" + MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-0324-FP4" + + @skip_pre_blackwell + @pytest.mark.skip_less_device_memory(95000) + @pytest.mark.parametrize( + "target_sparsity,thr_prefill,thr_decode", + [ + (0.9, 1418.142868970396, 863.147841750025), + ], + ids=["target_sparsity_0.9"], + ) + def test_skip_softmax_attention_multi_gpus(self, target_sparsity: float, + thr_prefill: float, + thr_decode: float): + sparse_attention_config = SkipSoftmaxAttentionConfig( + threshold_scale_factor={ + "prefill": thr_prefill, + "decode": thr_decode, + }) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70, + enable_block_reuse=False) + + sm_version = get_sm_version() + if sm_version not in (100, 103): + pytest.skip( + f"Skip softmax MLA attention is not supported on SM {sm_version}" + ) + + with LLM(self.MODEL_PATH, + attn_backend="TRTLLM", + tensor_parallel_size=8, + max_batch_size=32, + max_num_tokens=100000, + kv_cache_config=kv_cache_config, + sparse_attention_config=sparse_attention_config) as llm: + task = LongBenchV1(self.MODEL_NAME) + task.evaluate(llm, + extra_acc_spec=f"target_sparsity={target_sparsity}") + + @pytest.mark.timeout(14400) @pytest.mark.skip_less_device_memory(80000) class TestDeepSeekV32(LlmapiAccuracyTestHarness): diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index b0f6d9c356..1ccfa2e60d 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -19,6 +19,7 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_64] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_nvfp4_kv[attn_backend=TRTLLM-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3::test_skip_softmax_attention_multi_gpus[target_sparsity_0.9] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 16ac94ff94..6e3c30458f 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -334,6 +334,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_sof accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9] SKIP (https://nvbugs/5810940) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5810940) accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5810940) +accuracy/test_llm_api_pytorch.py::TestDeepSeekV3::test_skip_softmax_attention_multi_gpus[target_sparsity_0.9] SKIP (https://nvbugs/5810940) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] SKIP (https://nvbugs/5800646) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5800672)