diff --git a/.gitattributes b/.gitattributes index 6f2d66838c..7b111ed877 100644 --- a/.gitattributes +++ b/.gitattributes @@ -12,3 +12,5 @@ tests/integration/test_input_files/*.jpg filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog10_baseline_performance_detail.png filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog10_full_strategy_performance.png filter=lfs diff=lfs merge=lfs -text docs/source/blogs/media/tech_blog10_context_wait_performance.png filter=lfs diff=lfs merge=lfs -text +cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp filter=lfs diff=lfs merge=lfs -text +cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore index 712957ddd5..130ea9837b 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,7 @@ llm-test-workspace/ cpp/include/tensorrt_llm/executor/version.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h +cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp .devcontainer/.env /examples/layer_wise_benchmarks/profiles/ diff --git a/benchmarks/cpp/utils/utils.cpp b/benchmarks/cpp/utils/utils.cpp index 3a7c885c32..0cbcf1c046 100644 --- a/benchmarks/cpp/utils/utils.cpp +++ b/benchmarks/cpp/utils/utils.cpp @@ -1,6 +1,7 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & + *AFFILIATES. All rights reserved. 
* SPDX-License-Identifier: Apache-2.0 * * Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,13 +18,16 @@ */ #include "utils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include #include -namespace tensorrt_llm::benchmark +TRTLLM_NAMESPACE_BEGIN + +namespace benchmark { std::vector> parseVectorOfVectors(std::string const& input) @@ -98,7 +102,8 @@ Samples parseWorkloadJson( if (samples.size() < maxNumSamples) { TLLM_LOG_WARNING( - "Dataset size %zu is smaller than given max_num_samples %d, max_num_samples will be ignored.\n", + "Dataset size %zu is smaller than given max_num_samples " + "%d, max_num_samples will be ignored.\n", samples.size(), maxNumSamples); } return samples; @@ -160,4 +165,6 @@ std::ostream& operator<<(std::ostream& os, RecordBwMetric const& metric) return os; } -} // namespace tensorrt_llm::benchmark +} // namespace benchmark + +TRTLLM_NAMESPACE_END diff --git a/benchmarks/cpp/utils/utils.h b/benchmarks/cpp/utils/utils.h index 13e9fe1206..375a1cd9bf 100644 --- a/benchmarks/cpp/utils/utils.h +++ b/benchmarks/cpp/utils/utils.h @@ -16,6 +16,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/executor/executor.h" #include @@ -29,7 +30,9 @@ #pragma once -namespace tensorrt_llm::benchmark +TRTLLM_NAMESPACE_BEGIN + +namespace benchmark { // using namespace tensorrt_llm::batch_manager; @@ -237,4 +240,6 @@ std::vector generateRandomExponentialValues(int count, float lambda, int std::vector computeTimeDelays(BenchmarkParams const& benchmarkParams, int numDelays); -} // namespace tensorrt_llm::benchmark +} // namespace benchmark + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/algorithm.h b/cpp/include/tensorrt_llm/common/algorithm.h index 9363504f75..9fcf7b2b4a 100644 --- a/cpp/include/tensorrt_llm/common/algorithm.h +++ b/cpp/include/tensorrt_llm/common/algorithm.h @@ -16,8 +16,9 @@ #pragma once -namespace tensorrt_llm -{ +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN // Base class for algorithms struct Algorithm @@ -29,4 +30,4 @@ struct Algorithm Algorithm& operator=(Algorithm const&) = delete; }; -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/arrayView.h b/cpp/include/tensorrt_llm/common/arrayView.h index 31dcd74532..ce4ceb9ed6 100644 --- a/cpp/include/tensorrt_llm/common/arrayView.h +++ b/cpp/include/tensorrt_llm/common/arrayView.h @@ -17,9 +17,13 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { //! 
@@ -100,4 +104,6 @@ private: size_type mSize; }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/assert.h b/cpp/include/tensorrt_llm/common/assert.h index 0e916b7746..d53630ab5d 100644 --- a/cpp/include/tensorrt_llm/common/assert.h +++ b/cpp/include/tensorrt_llm/common/assert.h @@ -16,14 +16,19 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/tllmException.h" +TRTLLM_NAMESPACE_BEGIN + class DebugConfig { public: static bool isCheckDebugEnabled(); }; +TRTLLM_NAMESPACE_END + #if defined(_WIN32) #define TLLM_LIKELY(x) (__assume((x) == 1), (x)) #define TLLM_UNLIKELY(x) (__assume((x) == 0), (x)) @@ -35,8 +40,8 @@ public: #define TLLM_CHECK(val) \ do \ { \ - TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ - : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + TLLM_LIKELY(static_cast(val)) \ + ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ } while (0) #define TLLM_CHECK_WITH_INFO(val, info, ...) \ @@ -51,17 +56,17 @@ public: #define TLLM_CHECK_DEBUG(val) \ do \ { \ - if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \ + if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \ { \ - TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ - : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + TLLM_LIKELY(static_cast(val)) \ + ? ((void) 0) : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ } \ } while (0) #define TLLM_CHECK_DEBUG_WITH_INFO(val, info, ...) \ do \ { \ - if (TLLM_UNLIKELY(DebugConfig::isCheckDebugEnabled())) \ + if (TLLM_UNLIKELY(tensorrt_llm::DebugConfig::isCheckDebugEnabled())) \ { \ TLLM_LIKELY(static_cast(val)) \ ? 
((void) 0) \ diff --git a/cpp/include/tensorrt_llm/common/bindingUtils.h b/cpp/include/tensorrt_llm/common/bindingUtils.h index 83f72c676a..d61e1f7a14 100644 --- a/cpp/include/tensorrt_llm/common/bindingUtils.h +++ b/cpp/include/tensorrt_llm/common/bindingUtils.h @@ -17,9 +17,13 @@ #pragma once #include "c10/util/intrusive_ptr.h" +#include "tensorrt_llm/common/config.h" + #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // Adapted from pybind11's example implementation: @@ -69,4 +73,6 @@ c10::intrusive_ptr get_intrusive_ptr(PyObject* py_obj, std::string pybind11_a return c10::intrusive_ptr::reclaim_copy(p); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/config.h b/cpp/include/tensorrt_llm/common/config.h new file mode 100644 index 0000000000..71b97f9ab5 --- /dev/null +++ b/cpp/include/tensorrt_llm/common/config.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once +#ifndef TRTLLM_CONFIG_H +#define TRTLLM_CONFIG_H + +/** + * \def TRTLLM_ABI_NAMESPACE + * This macro is used to open an implicitly inline namespace block for the ABI version. + * This macro can be overridden to change the ABI version. + * The default ABI version is _v1. 
+ */ +#ifndef TRTLLM_ABI_NAMESPACE +#define TRTLLM_ABI_NAMESPACE _v1 +#endif + +#ifndef TRTLLM_ABI_NAMESPACE_BEGIN +#define TRTLLM_ABI_NAMESPACE_BEGIN \ + inline namespace TRTLLM_ABI_NAMESPACE \ + { +#endif + +#ifndef TRTLLM_ABI_NAMESPACE_END +#define TRTLLM_ABI_NAMESPACE_END } +#endif + +/** + * \def TRTLLM_NAMESPACE_BEGIN + * This macro is used to open a `tensorrt_llm::` namespace block, along with the + * inline ABI namespace opened by TRTLLM_ABI_NAMESPACE_BEGIN. + * This macro is defined by TensorRT-LLM and may not be overridden. + */ +#define TRTLLM_NAMESPACE_BEGIN \ + namespace tensorrt_llm \ + { \ + TRTLLM_ABI_NAMESPACE_BEGIN + +/** + * \def TRTLLM_NAMESPACE_END + * This macro is used to close a `tensorrt_llm::` namespace block, along with the + * inline ABI namespace closed by TRTLLM_ABI_NAMESPACE_END. + * This macro is defined by TensorRT-LLM and may not be overridden. + */ +#define TRTLLM_NAMESPACE_END \ + TRTLLM_ABI_NAMESPACE_END \ + } /* end namespace tensorrt_llm */ + +#endif // TRTLLM_CONFIG_H diff --git a/cpp/include/tensorrt_llm/common/cudaFp8Utils.h b/cpp/include/tensorrt_llm/common/cudaFp8Utils.h index 373aabc96c..75dae28eff 100644 --- a/cpp/include/tensorrt_llm/common/cudaFp8Utils.h +++ b/cpp/include/tensorrt_llm/common/cudaFp8Utils.h @@ -16,6 +16,8 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #ifdef ENABLE_FP8 #include #include @@ -29,8 +31,8 @@ #define USE_QGMMA #endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -320,5 +322,6 @@ void invokeComputeScalesAndQuantizeMatrix(T_OUT* output, T_S* quant_ptr, const T const int64_t lda, QuantizeMode quantize_mode, cudaStream_t stream); } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END #endif // ENABLE_FP8 diff --git a/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h b/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h index 985f4619ee..4f369c0592 100644 --- a/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h +++ 
b/cpp/include/tensorrt_llm/common/cudaProfilerUtils.h @@ -14,12 +14,18 @@ * limitations under the License. */ +#pragma once + +#include "tensorrt_llm/common/config.h" + #include #include #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /// @brief Populate the start and end profiling iteration indexes from the provided environment variables @@ -28,4 +34,6 @@ namespace tensorrt_llm::common std::pair, std::unordered_set> populateIterationIndexes( std::string const& envVarName, std::optional const& legacyEnvVarName = std::nullopt); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/cudaUtils.h b/cpp/include/tensorrt_llm/common/cudaUtils.h index 6626b18e38..3a11df85b1 100644 --- a/cpp/include/tensorrt_llm/common/cudaUtils.h +++ b/cpp/include/tensorrt_llm/common/cudaUtils.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" @@ -49,7 +50,9 @@ // this undef. 
#endif // WIN32 -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // workspace for cublas gemm : 32MB @@ -1417,7 +1420,9 @@ DEFINE_MEMBER_CHECKER(deq) DEFINE_MEMBER_CHECKER(qua) DEFINE_MEMBER_CHECKER(high_preciecion_normed_output) -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END /* * Macros compliant with TensorRT coding conventions diff --git a/cpp/include/tensorrt_llm/common/dataType.h b/cpp/include/tensorrt_llm/common/dataType.h index 6c19322135..2f19404f9c 100644 --- a/cpp/include/tensorrt_llm/common/dataType.h +++ b/cpp/include/tensorrt_llm/common/dataType.h @@ -16,11 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/tllmException.h" + #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { constexpr static size_t getDTypeSize(nvinfer1::DataType type) @@ -84,4 +88,6 @@ constexpr static size_t getDTypeSizeInBits(nvinfer1::DataType type) return ""; } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/logger.h b/cpp/include/tensorrt_llm/common/logger.h index c8164b10e5..5477415edf 100644 --- a/cpp/include/tensorrt_llm/common/logger.h +++ b/cpp/include/tensorrt_llm/common/logger.h @@ -22,9 +22,12 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { class Logger @@ -125,12 +128,12 @@ private: static inline std::string getPrefix(Level const level) { - return fmtstr("%s[%s] ", kPREFIX, getLevelName(level)); + return tensorrt_llm::common::fmtstr("%s[%s] ", kPREFIX, getLevelName(level)); } static inline std::string getPrefix(Level const level, int const rank) { - return fmtstr("%s[%s][%d] ", kPREFIX, getLevelName(level), rank); + return tensorrt_llm::common::fmtstr("%s[%s][%d] ", kPREFIX, 
getLevelName(level), rank); } }; @@ -171,6 +174,9 @@ void Logger::log(Logger::Level const level, int const rank, char const* format, out << std::endl; } } +} // namespace common + +TRTLLM_NAMESPACE_END #define TLLM_LOG(level, ...) \ do \ @@ -188,4 +194,3 @@ void Logger::log(Logger::Level const level, int const rank, char const* format, #define TLLM_LOG_WARNING(...) TLLM_LOG(tensorrt_llm::common::Logger::WARNING, __VA_ARGS__) #define TLLM_LOG_ERROR(...) TLLM_LOG(tensorrt_llm::common::Logger::ERROR, __VA_ARGS__) #define TLLM_LOG_EXCEPTION(ex, ...) tensorrt_llm::common::Logger::getLogger()->log(ex, ##__VA_ARGS__) -} // namespace tensorrt_llm::common diff --git a/cpp/include/tensorrt_llm/common/optionalRef.h b/cpp/include/tensorrt_llm/common/optionalRef.h index af93ac6d36..f55b377981 100644 --- a/cpp/include/tensorrt_llm/common/optionalRef.h +++ b/cpp/include/tensorrt_llm/common/optionalRef.h @@ -16,11 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /** @@ -100,4 +104,6 @@ public: } }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/quantization.h b/cpp/include/tensorrt_llm/common/quantization.h index 50aae114e0..df13a674d6 100644 --- a/cpp/include/tensorrt_llm/common/quantization.h +++ b/cpp/include/tensorrt_llm/common/quantization.h @@ -16,12 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -480,4 +482,5 @@ public: }; } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/stringUtils.h b/cpp/include/tensorrt_llm/common/stringUtils.h index a4803cba37..f4cf8a89be 100644 --- a/cpp/include/tensorrt_llm/common/stringUtils.h +++ b/cpp/include/tensorrt_llm/common/stringUtils.h @@ -16,6 +16,7 
@@ #pragma once +#include "tensorrt_llm/common/config.h" #if ENABLE_BF16 #include #endif // ENABLE_BF16 @@ -28,7 +29,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { #if ENABLE_BF16 static inline std::basic_ostream& operator<<(std::basic_ostream& stream, __nv_bfloat16 const& val) @@ -228,4 +231,6 @@ inline void toUpper(std::string& s) } } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/tllmException.h b/cpp/include/tensorrt_llm/common/tllmException.h index 9d222a0ca9..c705e1cf89 100644 --- a/cpp/include/tensorrt_llm/common/tllmException.h +++ b/cpp/include/tensorrt_llm/common/tllmException.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include @@ -41,7 +42,9 @@ tensorrt_llm::common::RequestSpecificException( \ __FILE__, __LINE__, tensorrt_llm::common::fmtstr(__VA_ARGS__).c_str(), requestID, errorCode) -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /// @brief Enumeration of different error codes for request-specific exceptions @@ -77,7 +80,8 @@ private: [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, char const* info) { - throw TllmException(file, line, fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str()); + throw TllmException( + file, line, tensorrt_llm::common::fmtstr("[TensorRT-LLM][ERROR] Assertion failed: %s", info).c_str()); } [[noreturn]] inline void throwRuntimeError(char const* const file, int const line, std::string const& info = "") @@ -102,4 +106,6 @@ private: RequestErrorCode mErrorCode; }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/common/utils.h b/cpp/include/tensorrt_llm/common/utils.h index 2a0ff72b53..22e6b628bb 100644 --- a/cpp/include/tensorrt_llm/common/utils.h +++ 
b/cpp/include/tensorrt_llm/common/utils.h @@ -16,6 +16,8 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include @@ -24,7 +26,9 @@ #include #endif -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { inline bool setThreadName(std::string const& name) @@ -43,4 +47,6 @@ bool contains(std::initializer_list const& c, T const& v) return std::find(c.begin(), c.end(), v) != c.end(); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/archCondition.h b/cpp/include/tensorrt_llm/kernels/archCondition.h index ef86d5745e..4d633d046b 100644 --- a/cpp/include/tensorrt_llm/kernels/archCondition.h +++ b/cpp/include/tensorrt_llm/kernels/archCondition.h @@ -16,7 +16,11 @@ #pragma once -namespace tensorrt_llm::kernels +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace detail @@ -110,4 +114,6 @@ inline constexpr bool is_compatible_v = is_compatible::value; } // namespace arch -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/decodingCommon.h b/cpp/include/tensorrt_llm/kernels/decodingCommon.h index 116a85e2ee..aa7e2f961f 100644 --- a/cpp/include/tensorrt_llm/kernels/decodingCommon.h +++ b/cpp/include/tensorrt_llm/kernels/decodingCommon.h @@ -17,11 +17,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/executor/types.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class FinishedState @@ -308,4 +311,6 @@ template void invokeScatterDecodingParams( T const* src, T scalar, T* dst, int const* batchSlots, int batchSize, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h 
b/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h index e664db6400..6f9c2c78a1 100644 --- a/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h +++ b/cpp/include/tensorrt_llm/kernels/kvCacheIndex.h @@ -17,11 +17,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class KVCacheIndex @@ -53,4 +56,6 @@ private: UnderlyingType value; }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h b/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h index 0119d8948a..6a6ac75ffa 100644 --- a/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h +++ b/cpp/include/tensorrt_llm/kernels/kvCachePartialCopy.h @@ -14,16 +14,18 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/iBuffer.h" using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numLayers, unsigned int numHeads, unsigned int tokensPerBlock, unsigned int numHidden, unsigned int numTokensToCopy, int kvFactor, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 43a175ba80..75b4f2f56e 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -2175,7 +2175,8 @@ def get_kernel_code(kspec, kname, lname): params_str = 'reinterpret_cast(params)' if generate_cu_trtllm else 'params' attn_mask_type_str = 'using Attention_mask_type = ContextAttentionMaskType;' if generate_cu_trtllm else 'using Attention_mask_type = fmha::Attention_mask_type;' bert_launch_params = '' if generate_cu_trtllm else 'using Launch_params = bert::Fused_multihead_attention_launch_params;' 
- include_str = '#include "../fused_multihead_attention_common.h"' if generate_cu_trtllm else '' + include_str = '#include "../fused_multihead_attention_common.h"\n' if generate_cu_trtllm else '' + include_str += '#include "tensorrt_llm/common/config.h"' if generate_cu_trtllm else '' num_compute_groups_str = '' if generate_cu_trtllm else 'static constexpr int NUM_COMPUTE_GROUPS = 2;' fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'{params_type}' const_fused_multihead_attention_params_v2_str = 'Fused_multihead_attention_params_v2' if generate_cu_trtllm else f'const {params_type}' @@ -2201,8 +2202,19 @@ def get_kernel_code(kspec, kname, lname): const int COMPUTE_REG_COUNT = {compute_reg_count}; asm volatile("{{setmaxnreg.inc.sync.aligned.u32 %0; \n\t}}" ::"n"(COMPUTE_REG_COUNT));'''.format( compute_reg_count=compute_reg_count) - local_ns_open = ns_open if generate_cu_trtllm else '' - local_ns_close = ns_close if generate_cu_trtllm else '' + abi_ns_open = r""" +TRTLLM_NAMESPACE_BEGIN +namespace kernels +{ +// clang-format off +""" + abi_ns_close = r""" +// clang-format on +} // namespace kernels +TRTLLM_NAMESPACE_END +""" + local_ns_open = abi_ns_open if generate_cu_trtllm else '' + local_ns_close = abi_ns_close if generate_cu_trtllm else '' tmp = dict(locals(), **kspec._asdict()) @@ -3077,8 +3089,10 @@ def use_cubin_header(sm, head_size, dtype, output_dtype=None): def get_cubin_header(kernel_traits, specs_names): cubins = [] cubin_lens = [] + launchers = [] cubins_dict = {} cubin_lens_dict = {} + launchers_dict = {} for kspec, fname, lname, kname in specs_names: if generate_cu_trtllm and not use_cubin_header( kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype): @@ -3282,11 +3296,11 @@ def get_cubin_header(kernel_traits, specs_names): if generate_cu_trtllm and lname != 'nullptr': launcher = 'extern void {lname}(Fused_multihead_attention_params_v2& params, const Launch_params& launch_params, 
cudaStream_t stream);'.format( lname=lname) - if int(sm) in cubins_dict: - if launcher not in cubins_dict[int(sm)]: - cubins_dict[int(sm)].append(launcher) + if int(sm) in launchers_dict: + if launcher not in launchers_dict[int(sm)]: + launchers_dict[int(sm)].append(launcher) else: - cubins_dict[int(sm)] = [launcher] + launchers_dict[int(sm)] = [launcher] elif 'mhca' in kname: code = '''\ {{ DATA_TYPE_{prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, kSM_{sm}, {cubin_name}, {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {is_il} }}\ @@ -3309,17 +3323,33 @@ def get_cubin_header(kernel_traits, specs_names): else: metadata_v2 = ',\n'.join(metadata_v2) # Add macros to only include needed cubins during compilation. - for sm in cubins_dict.keys(): + # Collect all SM versions from all dictionaries + all_sms = sorted( + set( + list(cubins_dict.keys()) + list(cubin_lens_dict.keys()) + + list(launchers_dict.keys()))) + + for sm in all_sms: macro_begin = f"#ifndef EXCLUDE_SM_{sm}" macro_end = f"#endif\n" - cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end]) + + # Add cubin array declarations + if sm in cubins_dict: + cubins.extend([macro_begin] + cubins_dict[sm] + [macro_end]) + + # Add cubin length declarations if sm in cubin_lens_dict: cubin_lens.extend([macro_begin] + cubin_lens_dict[sm] + [macro_end]) + # Add launcher declarations + if sm in launchers_dict: + launchers.extend([macro_begin] + launchers_dict[sm] + [macro_end]) + unroll_config_v1 = ',\n'.join(unroll_config_v1) unroll_config_v2 = ',\n'.join(unroll_config_v2) cubins = '\n'.join(cubins) cubin_lens = '\n'.join(cubin_lens) + launchers = '\n'.join(launchers) local_ns_open = ns_open local_ns_close = ns_close if generate_cu_trtllm else '}' launcher_line = ''' @@ -3431,7 +3461,157 @@ static const struct TestMetaV2 '''.format(**locals(), copyright=copyright) - return code + # Generate header content (.h file) + if "GENERATE_CUBIN" in os.environ: + header_content = '''\ +{copyright} 
+#pragma once + +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN +namespace kernels{{ + +struct FusedMultiHeadAttentionKernelMetaInfoV2 +{{ + Data_type mDataTypeIn; + Data_type mDataTypeOut; + unsigned int mS; + unsigned int mStepQ; + unsigned int mStepKV; + unsigned int mD; + unsigned int mDV; + unsigned int mSageBlockSizeQ; + unsigned int mSageBlockSizeK; + unsigned int mSageBlockSizeV; + unsigned int mSM; + const unsigned char* mCubin; + unsigned int mCubinSize; + const char* mFuncName; + unsigned int mSharedMemBytes; + unsigned int mThreadsPerCTA; + unsigned int mUnrollStep; + int mAttentionMaskType; + int mAttentionInputLayout; + bool mInterleaved; + bool mFlashAttention; + bool mWarpSpecialization; + bool mFP32Accumulation; + bool mAlibiSupported; + bool mTiled; + bool mEnableAttnLogitSoftcapping; + bool mReturnSoftmaxStats;{launcher_line} +}}; + +extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[]; +extern const int sMhaKernelMetaInfosV2Size; + +}} // namespace kernels +TRTLLM_NAMESPACE_END +'''.format(**locals(), copyright=copyright) + # Generate source content (.cpp file) + source_content = '''\ +{copyright} + +#include "tensorrt_llm/common/config.h" + +#include +#include +#include + +{local_ns_open} + +//--- Cubin Arrays +{cubins} + +//--- Cubin Lengths +{cubin_lens} + +{local_ns_close} + +using namespace tensorrt_llm::kernels; + +namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels {{ + +class Fused_multihead_attention_params_v2; +class Launch_params; + +//--- Kernel Launchers +{launchers} + +// FIXME: These are duplicated declarations, we should remove them in the future. 
+constexpr int32_t kSM_70 = 70; +constexpr int32_t kSM_72 = 72; +constexpr int32_t kSM_75 = 75; +constexpr int32_t kSM_80 = 80; +constexpr int32_t kSM_86 = 86; +constexpr int32_t kSM_89 = 89; +constexpr int32_t kSM_90 = 90; +constexpr int32_t kSM_100 = 100; +constexpr int32_t kSM_100f = 10100; +constexpr int32_t kSM_103 = 103; +constexpr int32_t kSM_120 = 120; +constexpr int32_t kSM_121 = 121; + +// FIXME: These are duplicated declarations, we should remove them in the future. +enum Data_type +{{ + DATA_TYPE_BOOL, + DATA_TYPE_FP16, + DATA_TYPE_FP32, + DATA_TYPE_INT4, + DATA_TYPE_INT8, + DATA_TYPE_INT32, + DATA_TYPE_BF16, + DATA_TYPE_E2M1, + DATA_TYPE_E4M3, + DATA_TYPE_E5M2 +}}; + +struct FusedMultiHeadAttentionKernelMetaInfoV2 +{{ + Data_type mDataTypeIn; + Data_type mDataTypeOut; + unsigned int mS; + unsigned int mStepQ; + unsigned int mStepKV; + unsigned int mD; + unsigned int mDV; + unsigned int mSageBlockSizeQ; + unsigned int mSageBlockSizeK; + unsigned int mSageBlockSizeV; + unsigned int mSM; + const unsigned char* mCubin; + unsigned int mCubinSize; + const char* mFuncName; + unsigned int mSharedMemBytes; + unsigned int mThreadsPerCTA; + unsigned int mUnrollStep; + int mAttentionMaskType; + int mAttentionInputLayout; + bool mInterleaved; + bool mFlashAttention; + bool mWarpSpecialization; + bool mFP32Accumulation; + bool mAlibiSupported; + bool mTiled; + bool mEnableAttnLogitSoftcapping; + bool mReturnSoftmaxStats;{launcher_line} +}}; + +extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{ +{metadata_v2} +}}; + +extern const int sMhaKernelMetaInfosV2Size = sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]); +}} // namespace tensorrt_llm::TRTLLM_ABI_NAMESPACE::kernels +'''.format(**locals(), copyright=copyright) + else: + # Non-GENERATE_CUBIN mode: use old behavior + header_content = code + source_content = None + + return header_content, source_content # This is used to add some kernels running in cubins for passing 
CI cases. @@ -3449,9 +3629,20 @@ def modify_cubin_header(cubin_header): return result target = "#ifndef EXCLUDE_SM_80" - addition = """extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[]; -extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len;""" - result = add_kernel_line(result, target, addition) + addition_cubin_array = """ +#ifndef EXCLUDE_SM_80 +extern unsigned char cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin[]; +#endif +""" + addition_cubin_length = """ +#ifndef EXCLUDE_SM_80 +extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len; +#endif +""" + # Add cubin array and length into their corresponding sections. + result = add_kernel_line(result, "//--- Cubin Arrays", addition_cubin_array) + result = add_kernel_line(result, "//--- Cubin Lengths", + addition_cubin_length) def modify_kernel_line(result, target, new_line): lines = result.split('\n') @@ -3534,13 +3725,22 @@ def generate_files(specs_names): output = output.decode('utf-8').strip() # this gives: kname, smem bytes, threads_per_cta, loop_step kernel_traits = [traits.split() for traits in output.splitlines()] - cubin_header = get_cubin_header(kernel_traits, valid_specs_names) + # get_cubin_header returns (header, source) for fmha_cubin.h and fmha_cubin.cpp. + # source is None when GENERATE_CUBIN is not set in the environment (old single-header behavior). + cubin_header, cubin_source = get_cubin_header(kernel_traits, + valid_specs_names) if generate_cu_trtllm: - cubin_header = modify_cubin_header(cubin_header) + cubin_source = modify_cubin_header(cubin_source) + # Write fmha_cubin.h file with open('./generated/fmha_cubin.h', 'w') as f: f.write(cubin_header) + # Write fmha_cubin.cpp file (same directory as fmha_cubin.h file) + if cubin_source is not None: + with open('./generated/fmha_cubin.cpp', 'w') as f: + f.write(cubin_source) + def enumerate_hgmma_tma_kernels(specs, 
sm=90): specs.append( diff --git a/cpp/kernels/xqa/gen_cpp_header.py b/cpp/kernels/xqa/gen_cpp_header.py index 51417bc96a..9513b5d456 100755 --- a/cpp/kernels/xqa/gen_cpp_header.py +++ b/cpp/kernels/xqa/gen_cpp_header.py @@ -127,7 +127,9 @@ TEMPLATE_PROLOGUE = '''/* */ #pragma once -namespace tensorrt_llm { +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN namespace kernels { ''' @@ -136,7 +138,8 @@ inline constexpr const char* {fname_var_name} = "{fname}"; ''' TEMPLATE_EPILOGUE = '''} -} +TRTLLM_NAMESPACE_END + ''' D = defaultdict(list) diff --git a/cpp/kernels/xqa/gen_cubins.py b/cpp/kernels/xqa/gen_cubins.py index 2a284f834a..a345861fb7 100755 --- a/cpp/kernels/xqa/gen_cubins.py +++ b/cpp/kernels/xqa/gen_cubins.py @@ -86,8 +86,10 @@ cpp_file_prefix_text = R"""/* * See the License for the specific language governing permissions and * limitations under the License. */ -namespace tensorrt_llm -{ + +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN namespace kernels { // clang-format off @@ -96,7 +98,7 @@ namespace kernels cpp_file_suffex_text = R""" // clang-format on } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END """ cubin_meta_info_struct_prefix_text = R""" diff --git a/cpp/tensorrt_llm/common/assert.cpp b/cpp/tensorrt_llm/common/assert.cpp index eaaf662447..4211a9a049 100755 --- a/cpp/tensorrt_llm/common/assert.cpp +++ b/cpp/tensorrt_llm/common/assert.cpp @@ -27,7 +27,7 @@ bool initCheckDebug() } } // namespace -bool DebugConfig::isCheckDebugEnabled() +bool tensorrt_llm::DebugConfig::isCheckDebugEnabled() { static bool const debugEnabled = initCheckDebug(); return debugEnabled; diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index f4ae207321..5994021eb4 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -16,6 +16,7 @@ */ #include "attentionOp.h" #include "tensorrt_llm/common/assert.h" +#include 
"tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" diff --git a/cpp/tensorrt_llm/common/attentionOp.h b/cpp/tensorrt_llm/common/attentionOp.h index f6c78480b6..653b4d65e7 100644 --- a/cpp/tensorrt_llm/common/attentionOp.h +++ b/cpp/tensorrt_llm/common/attentionOp.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/common/quantization.h" @@ -36,7 +37,9 @@ #include #endif // ENABLE_MULTI_DEVICE -namespace tensorrt_llm::common::op +TRTLLM_NAMESPACE_BEGIN + +namespace common::op { class AttentionOp @@ -543,4 +546,6 @@ private: UniqPtrWNullCopy mMultiBlockSemaphores = {}; }; -} // namespace tensorrt_llm::common::op +} // namespace common::op + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp index f3e81defd3..5cbe1b30d3 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.cpp +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.cpp @@ -16,6 +16,7 @@ #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasVersionCheck.h" #include #include @@ -24,8 +25,8 @@ #error CUDART_VERSION Undefined! 
#endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -661,4 +662,4 @@ void CublasMMWrapper::BlockScaleGemm(cublasOperation_t transa, cublasOperation_t } // namespace common -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cublasMMWrapper.h b/cpp/tensorrt_llm/common/cublasMMWrapper.h index 1ca1dbfee6..78a68204ea 100644 --- a/cpp/tensorrt_llm/common/cublasMMWrapper.h +++ b/cpp/tensorrt_llm/common/cublasMMWrapper.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -185,4 +186,4 @@ public: } // namespace common -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh index 0519251e6f..583c4991ea 100644 --- a/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh +++ b/cpp/tensorrt_llm/common/cudaBf16Fallbacks.cuh @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -291,7 +292,8 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _ #endif // ENABLE_BF16 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END // Operator definitions intentionally in global namespace namespace diff --git a/cpp/tensorrt_llm/common/cudaBufferUtils.cuh b/cpp/tensorrt_llm/common/cudaBufferUtils.cuh index a5da5bbcae..aad5e83cbf 100644 --- a/cpp/tensorrt_llm/common/cudaBufferUtils.cuh +++ b/cpp/tensorrt_llm/common/cudaBufferUtils.cuh @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include @@ -28,8 +29,8 @@ #include #include -namespace tensorrt_llm -{ 
+TRTLLM_NAMESPACE_BEGIN + namespace common { static __host__ __device__ int hash(int val) @@ -673,4 +674,5 @@ struct MultiProducerCircularBuffer : public CircularBuffer }; } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp index c754f39277..b961ef5042 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.cpp @@ -18,6 +18,7 @@ #if defined(_WIN32) #include + #define dllOpen(name) LoadLibrary("nv" name ".dll") #define dllClose(handle) FreeLibrary(static_cast(handle)) #define dllGetSym(handle, name) static_cast(GetProcAddress(static_cast(handle), name)) @@ -29,6 +30,7 @@ #endif // defined(_WIN32) #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/logger.h" #include @@ -36,7 +38,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { std::shared_ptr CUDADriverWrapper::getInstance() @@ -295,4 +299,6 @@ CUresult CUDADriverWrapper::cuOccupancyMaxActiveClusters( return (*_cuOccupancyMaxActiveClusters)(maxActiveClusters, f, config); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaDriverWrapper.h b/cpp/tensorrt_llm/common/cudaDriverWrapper.h index cc3328993c..236be28fd2 100644 --- a/cpp/tensorrt_llm/common/cudaDriverWrapper.h +++ b/cpp/tensorrt_llm/common/cudaDriverWrapper.h @@ -17,6 +17,7 @@ #ifndef CUDA_DRIVER_WRAPPER_H #define CUDA_DRIVER_WRAPPER_H +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { class CUDADriverWrapper @@ -165,8 +168,9 @@ void checkDriverExitSafe(T 
result, char const* const func, char const* const fil } } -} // namespace tensorrt_llm::common +} // namespace common +TRTLLM_NAMESPACE_END /* * Macros compliant with TensorRT coding conventions */ diff --git a/cpp/tensorrt_llm/common/cudaFp8Utils.cu b/cpp/tensorrt_llm/common/cudaFp8Utils.cu index 06afb96b95..39616f100c 100644 --- a/cpp/tensorrt_llm/common/cudaFp8Utils.cu +++ b/cpp/tensorrt_llm/common/cudaFp8Utils.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { #ifdef ENABLE_FP8 @@ -466,4 +467,5 @@ DEFINE_INVOKE_QUANTIZE_MATRIX(__nv_bfloat16, float, __nv_fp8_e4m3); #endif // ENABLE_FP8 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp b/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp index 5576fe782f..959fa3e906 100644 --- a/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp +++ b/cpp/tensorrt_llm/common/cudaProfilerUtils.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/cudaProfilerUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/stringUtils.h" #include @@ -54,7 +55,9 @@ std::tuple, std::unordered_set> populateIte } // namespace -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { std::pair, std::unordered_set> populateIterationIndexes( @@ -81,4 +84,6 @@ std::pair, std::unordered_set> populateIter return std::make_pair(profileIterIdxs, stopIterIdxs); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh index a0463a3a49..157b561d4c 100644 --- 
a/cpp/tensorrt_llm/common/cudaTypeUtils.cuh +++ b/cpp/tensorrt_llm/common/cudaTypeUtils.cuh @@ -25,9 +25,10 @@ #if ENABLE_BF16 #include #endif +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace common { @@ -749,4 +750,5 @@ __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val) #endif // ENABLE_FP8 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 9a466512e4..4115ac150f 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h" @@ -25,7 +26,9 @@ using tensorrt_llm::kernels::AllReduceFusionOp; using tensorrt_llm::kernels::AllReduceStrategyType; -namespace tensorrt_llm::utils::customAllReduceUtils +TRTLLM_NAMESPACE_BEGIN + +namespace utils::customAllReduceUtils { constexpr size_t NUM_POINTERS_PER_RANK = 7; @@ -292,4 +295,6 @@ inline const std::unordered_map AllReduceBe {90, AllReduceBestStrategyTableSM90}, {100, AllReduceBestStrategyTableSM100}, }; -} // namespace tensorrt_llm::utils::customAllReduceUtils +} // namespace utils::customAllReduceUtils + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index 3dfeb91a9e..fc85975acb 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -16,6 +16,7 @@ */ #include "envUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/stringUtils.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace 
common { std::optional getIntEnv(char const* name) @@ -528,4 +531,6 @@ bool getEnvEplbForceGdrcopy() return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY"); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 6142781f6a..8a3af2458d 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -16,13 +16,16 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // Useful when you want to inject some debug code controllable with env var. std::optional getIntEnv(char const* name); @@ -153,4 +156,6 @@ bool getEnvKVCacheTransferAllBlocksForWindow(); bool getEnvEplbForceGdrcopy(); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/lamportUtils.cuh b/cpp/tensorrt_llm/common/lamportUtils.cuh index 4713d1a240..9e2f22d1a1 100644 --- a/cpp/tensorrt_llm/common/lamportUtils.cuh +++ b/cpp/tensorrt_llm/common/lamportUtils.cuh @@ -19,6 +19,7 @@ #ifndef TRTLLM_CUDA_LAMPORT_UTILS_CUH #define TRTLLM_CUDA_LAMPORT_UTILS_CUH +#include "tensorrt_llm/common/config.h" #include #include #include @@ -29,7 +30,9 @@ #include "tensorrt_llm/common/cudaTypeUtils.cuh" -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { constexpr uint16_t kNEGZERO_FP16 = 0x8000U; @@ -279,6 +282,7 @@ private: } }; -} // namespace tensorrt_llm::common +} // namespace common +TRTLLM_NAMESPACE_END #endif // TRTLLM_CUDA_LAMPORT_UTILS_CUH diff --git a/cpp/tensorrt_llm/common/logger.cpp b/cpp/tensorrt_llm/common/logger.cpp index 2c2edb5af8..5daa79d92e 100644 --- a/cpp/tensorrt_llm/common/logger.cpp +++ b/cpp/tensorrt_llm/common/logger.cpp @@ -15,12 +15,15 @@ */ #include "tensorrt_llm/common/logger.h" +#include 
"tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { Logger::Logger() @@ -70,4 +73,6 @@ Logger* Logger::getLogger() thread_local Logger instance; return &instance; } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/mathUtils.h b/cpp/tensorrt_llm/common/mathUtils.h index 1bad3a2c15..670923dc28 100644 --- a/cpp/tensorrt_llm/common/mathUtils.h +++ b/cpp/tensorrt_llm/common/mathUtils.h @@ -16,10 +16,11 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -34,4 +35,5 @@ inline __device__ __host__ T divUp(T m, T n) //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp b/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp index b490e2bcdb..8dcd6b1985 100644 --- a/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp +++ b/cpp/tensorrt_llm/common/mcastDevMemUtils.cpp @@ -14,11 +14,15 @@ * limitations under the License. 
*/ #include "mcastDevMemUtils.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::common +using McastDeviceMemory = ::tensorrt_llm::runtime::McastDeviceMemory; + +TRTLLM_NAMESPACE_BEGIN + +namespace common { -using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory; namespace { @@ -84,4 +88,6 @@ McastDeviceMemory* findMcastDevMemBuffer(void* ptr) { return McastDevMemBufferRegistry::getInstance().findBuffer(ptr); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/mcastDevMemUtils.h b/cpp/tensorrt_llm/common/mcastDevMemUtils.h index def72dd044..50c7a48291 100644 --- a/cpp/tensorrt_llm/common/mcastDevMemUtils.h +++ b/cpp/tensorrt_llm/common/mcastDevMemUtils.h @@ -15,13 +15,17 @@ */ #pragma once -// Avoid circular dependency +#include "tensorrt_llm/common/config.h" + namespace tensorrt_llm::runtime { class McastDeviceMemory; -} +} // namespace tensorrt_llm::runtime -namespace tensorrt_llm::common +// Avoid circular dependency +TRTLLM_NAMESPACE_BEGIN + +namespace common { using McastDeviceMemory = tensorrt_llm::runtime::McastDeviceMemory; // Register a buffer with the McastDeviceMemory class. This function does not check if the ptr belongs to the buffer! @@ -31,4 +35,6 @@ void unregisterMcastDevMemBuffer(McastDeviceMemory* buf); // information. Thus a derived pointer cannot used as the key. 
McastDeviceMemory* findMcastDevMemBuffer(void* ptr); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/memoryUtils.cu b/cpp/tensorrt_llm/common/memoryUtils.cu index ff22bbb7c4..fc13db3096 100644 --- a/cpp/tensorrt_llm/common/memoryUtils.cu +++ b/cpp/tensorrt_llm/common/memoryUtils.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" @@ -25,8 +26,8 @@ #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -961,4 +962,5 @@ void calcAlignedPointers( } } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/memoryUtils.h b/cpp/tensorrt_llm/common/memoryUtils.h index 267c6015b2..f55e422631 100644 --- a/cpp/tensorrt_llm/common/memoryUtils.h +++ b/cpp/tensorrt_llm/common/memoryUtils.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -293,4 +294,5 @@ AlignedPointersUnpacker inline calcAlignedPointers( } } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h index d128741e0a..8e5d2c9154 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.h +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -46,7 +47,9 @@ #include #endif -namespace tensorrt_llm::common::nccl_util +TRTLLM_NAMESPACE_BEGIN + +namespace common::nccl_util { 
//============================================================================== @@ -392,6 +395,8 @@ inline std::pair createNCCLWindowTensor( return std::make_pair(tensor, buffer); } -} // namespace tensorrt_llm::common::nccl_util +} // namespace common::nccl_util + +TRTLLM_NAMESPACE_END #endif // ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/common/nvtxUtils.h b/cpp/tensorrt_llm/common/nvtxUtils.h index 4891a612ba..07f063e913 100644 --- a/cpp/tensorrt_llm/common/nvtxUtils.h +++ b/cpp/tensorrt_llm/common/nvtxUtils.h @@ -25,10 +25,13 @@ #if defined(__clang__) #pragma clang diagnostic pop #endif +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::common::nvtx +TRTLLM_NAMESPACE_BEGIN + +namespace common::nvtx { inline nvtx3::color nextColor() { @@ -46,8 +49,9 @@ inline nvtx3::color nextColor() #endif } -} // namespace tensorrt_llm::common::nvtx +} // namespace common::nvtx +TRTLLM_NAMESPACE_END #define NVTX3_SCOPED_RANGE_WITH_NAME(range, name) \ ::nvtx3::scoped_range range(::tensorrt_llm::common::nvtx::nextColor(), name) #define NVTX3_SCOPED_RANGE(range) NVTX3_SCOPED_RANGE_WITH_NAME(range##_range, #range) diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 72d966e43d..3acdf54843 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -29,6 +29,7 @@ #include #include +TRTLLM_NAMESPACE_BEGIN #if ENABLE_MULTI_DEVICE std::unordered_map* getDtypeMap() @@ -378,3 +379,5 @@ std::shared_ptr getCublasLtHandle() }); return creator(); } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/opUtils.h b/cpp/tensorrt_llm/common/opUtils.h index cb5911fe10..3018a5da10 100644 --- a/cpp/tensorrt_llm/common/opUtils.h +++ b/cpp/tensorrt_llm/common/opUtils.h @@ -17,6 +17,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include "tensorrt_llm/common/workspace.h" @@ -37,7 +38,9 @@ #include #include -namespace 
tensorrt_llm::common::op +TRTLLM_NAMESPACE_BEGIN + +namespace common::op { // Write values into buffer @@ -178,7 +181,7 @@ struct hash // for testing only void const* getCommSessionHandle(); -} // namespace tensorrt_llm::common::op +} // namespace common::op inline bool isBuilding() { @@ -220,6 +223,8 @@ std::shared_ptr getComm(std::set const& group); std::shared_ptr getCublasHandle(); std::shared_ptr getCublasLtHandle(); +TRTLLM_NAMESPACE_END + #ifndef DEBUG #define PLUGIN_CHECK(status) \ diff --git a/cpp/tensorrt_llm/common/quantTypeUtils.cuh b/cpp/tensorrt_llm/common/quantTypeUtils.cuh index a228d3f9fc..bfe924b109 100644 --- a/cpp/tensorrt_llm/common/quantTypeUtils.cuh +++ b/cpp/tensorrt_llm/common/quantTypeUtils.cuh @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaFp8Utils.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -52,4 +53,5 @@ struct QuantTypeStaticVals<__nv_fp8_e4m3> #endif // ENABLE_FP8 } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index 04af7e4ec5..485a4aedb4 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -21,6 +21,7 @@ #else #include #endif +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include #include @@ -30,8 +31,8 @@ namespace cg = cooperative_groups; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace common { @@ -423,4 +424,5 @@ __device__ __forceinline__ half clamp_inf_for_half(float const input) } } // namespace common -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/safetensors.cpp b/cpp/tensorrt_llm/common/safetensors.cpp index d948e91146..9171f79e44 100644 --- 
a/cpp/tensorrt_llm/common/safetensors.cpp +++ b/cpp/tensorrt_llm/common/safetensors.cpp @@ -17,6 +17,7 @@ #include "safetensors.h" #include "nlohmann/json.hpp" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include #include @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::common::safetensors +TRTLLM_NAMESPACE_BEGIN + +namespace common::safetensors { using nvinfer1::DataType; @@ -164,4 +167,6 @@ std::shared_ptr ISafeTensor::open(char const* filename) { return std::make_shared(filename); } -} // namespace tensorrt_llm::common::safetensors +} // namespace common::safetensors + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/safetensors.h b/cpp/tensorrt_llm/common/safetensors.h index 3af8d959be..e31225f1be 100644 --- a/cpp/tensorrt_llm/common/safetensors.h +++ b/cpp/tensorrt_llm/common/safetensors.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::common::safetensors +TRTLLM_NAMESPACE_BEGIN + +namespace common::safetensors { class INdArray { @@ -58,4 +61,6 @@ public: virtual ~ISafeTensor() = default; }; -} // namespace tensorrt_llm::common::safetensors +} // namespace common::safetensors + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/stlUtils.h b/cpp/tensorrt_llm/common/stlUtils.h index 9cda9fa0d4..7b12fd6d34 100644 --- a/cpp/tensorrt_llm/common/stlUtils.h +++ b/cpp/tensorrt_llm/common/stlUtils.h @@ -16,12 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::common::stl_utils +TRTLLM_NAMESPACE_BEGIN + +namespace common::stl_utils { template @@ -120,4 +123,6 @@ std::string toString(std::optional const& t, typename std::enable_if_t #include @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + 
+namespace common { void fmtstr_(char const* format, fmtstr_allocator alloc, void* target, va_list args) @@ -73,4 +76,6 @@ std::unordered_set str2set(std::string const& input, char delimiter return values; }; -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/timestampUtils.cpp b/cpp/tensorrt_llm/common/timestampUtils.cpp index c00041abda..66c01fbd7a 100644 --- a/cpp/tensorrt_llm/common/timestampUtils.cpp +++ b/cpp/tensorrt_llm/common/timestampUtils.cpp @@ -14,13 +14,16 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include #include #include #include "tensorrt_llm/common/timestampUtils.h" -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { std::string getCurrentTimestamp() @@ -39,4 +42,6 @@ std::string getCurrentTimestamp() return stream.str(); } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/timestampUtils.h b/cpp/tensorrt_llm/common/timestampUtils.h index f52f23028c..92a9c0e38f 100644 --- a/cpp/tensorrt_llm/common/timestampUtils.h +++ b/cpp/tensorrt_llm/common/timestampUtils.h @@ -14,12 +14,17 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { /// @brief Get the current timestamp in the format "MM-DD-YYYY HH:MM:SS:uuuuuu" std::string getCurrentTimestamp(); -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/tllmException.cpp b/cpp/tensorrt_llm/common/tllmException.cpp index a6aaa5e259..1b71fe5572 100644 --- a/cpp/tensorrt_llm/common/tllmException.cpp +++ b/cpp/tensorrt_llm/common/tllmException.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/tllmException.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include @@ -26,7 +27,9 @@ #endif #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { namespace @@ -128,4 +131,6 @@ RequestErrorCode RequestSpecificException::getErrorCode() const noexcept return mErrorCode; } -} // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/workspace.h b/cpp/tensorrt_llm/common/workspace.h index 0dd32ed16d..c92d02fa9d 100644 --- a/cpp/tensorrt_llm/common/workspace.h +++ b/cpp/tensorrt_llm/common/workspace.h @@ -14,10 +14,13 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // CuBLAS >= 12.9.1 requires 256-byte alignment. 
@@ -85,4 +88,6 @@ inline size_t calculateTotalWorkspaceSize( return total; } -}; // namespace tensorrt_llm::common +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h index c83a9a074d..c49cd09cdb 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/compute_occupancy.h @@ -18,10 +18,11 @@ #include #include "cutlass/device_kernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace cutlass_extensions { @@ -85,4 +86,5 @@ inline int compute_occupancy_for_kernel() } } // namespace cutlass_extensions -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h index 032f411f17..c6326ef0fe 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/epilogue_helpers.h @@ -30,10 +30,11 @@ #include "cutlass/epilogue/thread/linear_combination_relu.h" #include "cutlass/epilogue/thread/linear_combination_silu.h" #include "cutlass_extensions/epilogue/thread/fused_activations.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace cutlass_extensions { @@ -150,4 +151,5 @@ struct Epilogue const& modelPathOpt, std::optional const& engineBufferOpt, runtime::GptJsonConfig const& jsonConfig, - ExecutorConfig const& executorConfig, bool isEncoder, + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, bool isEncoder, std::optional> const& managedWeightsOpt) { auto const 
gpusPerNode = jsonConfig.getGpusPerNode(); @@ -288,7 +289,7 @@ void Executor::Impl::loadModel(std::optional const& model Executor::Impl::Impl(std::filesystem::path const& modelPath, std::optional const& encoderModelPath, ModelType const modelType, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { auto decoderJsonConfig = runtime::GptJsonConfig::parse(modelPath / "config.json"); @@ -329,7 +330,7 @@ Executor::Impl::Impl(std::filesystem::path const& modelPath, Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& jsonConfigStr, std::optional const& encoderEngineBufferView, std::optional const& encoderJsonConfigStr, - ModelType const modelType, ExecutorConfig const& executorConfig, + ModelType const modelType, ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional> const& managedWeightsOpt) { auto decoderJsonConfig = runtime::GptJsonConfig::parse(jsonConfigStr); @@ -367,7 +368,7 @@ Executor::Impl::Impl(BufferView const& engineBufferView, std::string const& json } Executor::Impl::Impl(std::shared_ptr model, std::optional> encoderModel, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { auto const& worldConfig = model->getWorldConfig(); auto const tp = worldConfig.getTensorParallelism(); @@ -388,7 +389,7 @@ Executor::Impl::~Impl() shutdown(); } -void Executor::Impl::initialize(ExecutorConfig const& executorConfig) +void Executor::Impl::initialize(::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__); @@ -484,7 +485,7 @@ void Executor::Impl::initialize(ExecutorConfig const& executorConfig) std::shared_ptr Executor::Impl::createModel(runtime::RawEngine const& rawEngine, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { 
auto const gptModelType = [&executorConfig, &modelConfig]() { @@ -512,7 +513,7 @@ std::shared_ptr Executor::Impl::createModel(runtime::RawEngine const& raw std::shared_ptr Executor::Impl::createEncoderModel(runtime::RawEngine const& rawEngine, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig, - ExecutorConfig const& executorConfig) + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig) { auto fixedExecutorConfig = ExecutorConfig{}; fixedExecutorConfig.setSchedulerConfig(executorConfig.getSchedulerConfig()); @@ -579,7 +580,7 @@ void Executor::Impl::setOrchLeaderComm( } void Executor::Impl::initializeCommAndWorkers(SizeType32 tp, SizeType32 pp, SizeType32 cp, - ExecutorConfig const& executorConfig, std::optional modelType, + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, std::optional modelType, std::optional const& modelPath, std::optional const& worldConfig, std::optional const& decoderGptJsonConfig) { @@ -638,7 +639,7 @@ void Executor::Impl::validateParallelConfig(ParallelConfig const& parallelConfig } void Executor::Impl::initializeOrchestrator(SizeType32 tp, SizeType32 pp, SizeType32 cp, - ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType, + ::tensorrt_llm::executor::ExecutorConfig const& executorConfig, ParallelConfig parallelConfig, ModelType modelType, std::filesystem::path const& modelPath) { #if ENABLE_MULTI_DEVICE diff --git a/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h b/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h index b0ac689d38..9e316c0b4e 100644 --- a/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h +++ b/cpp/tensorrt_llm/kernels/IndexerKCacheScatter.h @@ -16,9 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_scale_bytes, uint8_t* 
k_cache, @@ -28,3 +31,5 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca cudaStream_t stream = 0); } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/IndexerTopK.h b/cpp/tensorrt_llm/kernels/IndexerTopK.h index 546d18d7a4..e4c79a3f1b 100644 --- a/cpp/tensorrt_llm/kernels/IndexerTopK.h +++ b/cpp/tensorrt_llm/kernels/IndexerTopK.h @@ -17,12 +17,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { void invokeIndexerTopKDecode(float const* logits, int const* seqLens, int* indices, float* outLogitsAux, int* outIndicesAux, int const splitWorkThreshold, int const numRows, int const numColumns, int const stride0, @@ -32,4 +35,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con int const numRows, int const numColumns, int const stride0, int const stride1, int const topK = 2048, cudaStream_t const stream = 0); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/attentionMask.cu b/cpp/tensorrt_llm/kernels/attentionMask.cu index 64514a926a..a31b3e1ae7 100644 --- a/cpp/tensorrt_llm/kernels/attentionMask.cu +++ b/cpp/tensorrt_llm/kernels/attentionMask.cu @@ -15,6 +15,7 @@ */ #include "attentionMask.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -231,4 +232,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams<__nv_fp8_e4m3> const& //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace 
tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/attentionMask.h b/cpp/tensorrt_llm/kernels/attentionMask.h index fcfafb3df7..f3a4bf62c7 100644 --- a/cpp/tensorrt_llm/kernels/attentionMask.h +++ b/cpp/tensorrt_llm/kernels/attentionMask.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -25,8 +26,8 @@ namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -64,4 +65,5 @@ template void invokeBuildAttentionMask(AttentionMaskParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banBadWords.cu b/cpp/tensorrt_llm/kernels/banBadWords.cu index 53b55e8adc..c5f7799726 100644 --- a/cpp/tensorrt_llm/kernels/banBadWords.cu +++ b/cpp/tensorrt_llm/kernels/banBadWords.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/banBadWords.h" using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -130,4 +131,5 @@ template void invokeBanBadWords(float* logits, TokenIdType const** output_ids_pt SizeType32 const* sequence_lengths, SizeType32 max_seq_len, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banBadWords.h b/cpp/tensorrt_llm/kernels/banBadWords.h index 1057c45911..39fa10fdba 100644 --- a/cpp/tensorrt_llm/kernels/banBadWords.h +++ b/cpp/tensorrt_llm/kernels/banBadWords.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeBanBadWords(T* logits, runtime::TokenIdType const** output_ids_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu index 9011811b45..e2d06f857d 100644 --- a/cpp/tensorrt_llm/kernels/banRepeatNgram.cu +++ b/cpp/tensorrt_llm/kernels/banRepeatNgram.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/banRepeatNgram.h" using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -178,4 +179,4 @@ INVOKE_BAN_REPEAT_NGRAM(__nv_bfloat16) } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/banRepeatNgram.h b/cpp/tensorrt_llm/kernels/banRepeatNgram.h index 8218331734..5541dc4bca 100644 --- a/cpp/tensorrt_llm/kernels/banRepeatNgram.h +++ b/cpp/tensorrt_llm/kernels/banRepeatNgram.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeBanRepeatNgram(T* logits, runtime::TokenIdType const** output_ids_buf runtime::SizeType32 vocab_size_padded, runtime::SizeType32 max_step, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels.cu index ff5f5347b4..005a153916 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -355,4 +356,5 @@ template void printLogProbs(float const* x, int const nBS, int const nBMI template void printLogProbs(half const* x, int const nBS, int const nBMIn, int const nBM, int const nV); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.h b/cpp/tensorrt_llm/kernels/beamSearchKernels.h index ebf41d7787..d8a9266e94 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/topkLastDim.h" // Air TopK @@ -22,8 +23,8 @@ #define BEAM_SEARCH_DEBUG 0 -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { static size_t constexpr kMaxBeamWidth = 1024; // Max beam width supported in TRT-LLM now @@ -88,7 +89,7 @@ struct BeamHypotheses // Pointers related to beam search process, they are initialized in those two functions: // [gptDecoder.cpp] GptDecoder::forward or [dynamicDecodeOp.cpp] FtDynamicDecode::forward bool* batchDones{nullptr}; // [BS] %% self.beam_hyps_is_done whether a whole batch is finished - FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished + ::tensorrt_llm::kernels::FinishedState* finished{nullptr}; // [BS*BM], uint8 %% self.finished whether and how a beam is finished // Pointers for backtrack of the beams, they are relocated in [dynamicDecodeLayer.cpp] DynamicDecodeLayer::prepareIdsPtrs int** outputIdsPtr{nullptr}; // [BS][BM, MSL] %% self.output_ids @@ -131,11 +132,11 @@ void invokeUpdateCacheIndirection(int* 
tgtCI, int const* srcCI, BeamHypotheses& runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); __global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, + ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); __global__ void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, + ::tensorrt_llm::kernels::FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS, @@ -219,4 +220,5 @@ void printLogProbs(float const* x, int const nBS, int const nBMIn, int const nBM #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu index 2d611b877f..4d60055585 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels1024.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 1024, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu 
b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu index c76929186c..bf23a844b9 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels128.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 128, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu index 698459cfa1..50bf27b142 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels16.cu @@ -15,13 +15,15 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { // Skip V1 kernels if beam_width > kMaxBeamWidthForV1 INSTANTIATE_BEAM_SEARCH(float, 16, true); INSTANTIATE_BEAM_SEARCH(half, 16, true); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu index 1ba2498129..fae7cd927e 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels256.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 256, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu index 9e7f528725..d414d268c0 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels32.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 32, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu index ce74250dbc..d1815d85e3 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels4.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { INSTANTIATE_BEAM_SEARCH(float, 4, false); @@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 4, true); INSTANTIATE_BEAM_SEARCH(half, 4, false); INSTANTIATE_BEAM_SEARCH(half, 4, true); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu index dd5f78a35f..005f44e5e7 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels512.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 512, true); #endif // FAST_BUILD } // 
namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu index 65a43f9b4d..87a34b2d07 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels64.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_BEAM_SEARCH(half, 64, true); #endif // FAST_BUILD } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu index e1161ddc6d..7b84b37050 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernels8.cu @@ -15,9 +15,10 @@ */ #include "beamSearchKernelsTemplate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { INSTANTIATE_BEAM_SEARCH(float, 8, false); @@ -25,4 +26,5 @@ INSTANTIATE_BEAM_SEARCH(float, 8, true); INSTANTIATE_BEAM_SEARCH(half, 8, false); INSTANTIATE_BEAM_SEARCH(half, 8, true); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h index 331590c526..6ae82e5ad8 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h @@ -18,11 +18,13 @@ #error CUDART_VERSION Undefined! 
#elif (CUDART_VERSION >= 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/common/stringUtils.h" @@ -31,8 +33,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -731,4 +733,5 @@ void beamSearchKernelLauncher( T const* logProbs, T const* bias, void* workspace, BeamHypotheses& bh, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu index 951492b5ff..398ea05260 100644 --- a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu +++ b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.cu @@ -14,12 +14,12 @@ * limitations under the License. 
*/ +#include "buildRelativeAttentionBiasKernel.h" +#include "tensorrt_llm/common/config.h" #include -#include "buildRelativeAttentionBiasKernel.h" +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -99,4 +99,5 @@ template void invokeBuildRelativeAttentionBias<__nv_bfloat16>(__nv_bfloat16* rel #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h index 67f622345d..bdeea2b2af 100644 --- a/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h +++ b/cpp/tensorrt_llm/kernels/buildRelativeAttentionBiasKernel.h @@ -17,10 +17,11 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -30,4 +31,5 @@ void invokeBuildRelativeAttentionBias(T* relative_attention_bias, T const* relat cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu index 39b8136d25..8ec6bbbf82 100644 --- a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu +++ b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.cu @@ -19,12 +19,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/kernels/causalConv1d/causalConv1d.h" -namespace tensorrt_llm::kernels::causal_conv1d +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::causal_conv1d { template @@ -490,4 +493,6 @@ template void causal_conv1d_update_cuda(ConvParamsBase& params, cu template void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::causal_conv1d +} // namespace kernels::causal_conv1d + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h index 53c9b042c4..2597ebbb30 100644 --- a/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h +++ b/cpp/tensorrt_llm/kernels/causalConv1d/causalConv1d.h @@ -20,11 +20,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include -namespace tensorrt_llm::kernels::causal_conv1d +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::causal_conv1d { #define TLLM_CUDA_KERNEL_LAUNCH_CHECK() TLLM_CUDA_CHECK(cudaGetLastError()) @@ -214,4 +217,6 @@ void causal_conv1d_fwd_cuda(ConvParamsBase& params, cudaStream_t stream); template void causal_conv1d_update_cuda(ConvParamsBase& params, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::causal_conv1d +} // namespace kernels::causal_conv1d + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu index 785285bddd..2f6ac3fab7 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.cu @@ -13,13 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/kernels/quantization.cuh" #include -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { template struct SyncComm @@ -818,4 +821,6 @@ void allreduce_fusion_op(AllReduceFusionParams const& params) DISPATCH_RANKS(16); TLLM_CHECK_WITH_INFO(false, "allreduce_fusion_kernel: unsupported ranks number!"); } -}; // namespace tensorrt_llm::kernels::ar_fusion +}; // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h index 52487b25d4..1fc18c415d 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h @@ -15,16 +15,19 @@ */ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/runtime/ipcUtils.h" -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { template struct ElemsPerAccess; @@ -139,4 +142,6 @@ struct AllReduceFusionParams }; void allreduce_fusion_op(AllReduceFusionParams const& params); -} // namespace tensorrt_llm::kernels::ar_fusion +} // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu index fc96dcc73f..3c4b4b5049 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu +++ 
b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.cu @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h" -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { __global__ void lamport_initialize_kernel(float* ptr, int size) @@ -94,4 +97,6 @@ void** Workspace::get_workspace() { return reinterpret_cast(m_workspace); } -}; // namespace tensorrt_llm::kernels::ar_fusion +}; // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h index f72f94d296..055d29c3a0 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h @@ -16,11 +16,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/communicationKernels/allReduceFusionKernels.h" #include "tensorrt_llm/runtime/ipcUtils.h" -namespace tensorrt_llm::kernels::ar_fusion +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion { class Workspace @@ -41,4 +44,6 @@ private: }; void lamport_initialize(void* ptr, int bytes, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::ar_fusion +} // namespace kernels::ar_fusion + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu index 82c17119e2..f1d5c08bda 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu +++ 
b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { using tensorrt_llm::common::divUp; @@ -1632,4 +1635,6 @@ void customLowPrecisionAllReduce( sync_check_cuda_error(stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h index f4df59fcf2..5fc87ef1a5 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h @@ -17,6 +17,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/customAllReduceKernels.h" #include @@ -24,7 +25,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { constexpr int LP_ALLREDUCE_MAX_BLOCKS = 8; @@ -119,4 +122,6 @@ void customLowPrecisionAllReduce( kernels::LowPrecisionAllReduceParams& params, nvinfer1::DataType dataType, cudaStream_t stream); int32_t max_workspace_size_lowprecision(int32_t tp_size); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu index 5a0727fcc3..47d4cf3736 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu +++ 
b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include "mnnvlAllreduceKernels.h" +#include "tensorrt_llm/common/config.h" #include #include #include @@ -31,7 +32,9 @@ #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" -namespace tensorrt_llm::kernels::mnnvl +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::mnnvl { using tensorrt_llm::common::isNegZero; @@ -1029,4 +1032,6 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params) } } -} // namespace tensorrt_llm::kernels::mnnvl +} // namespace kernels::mnnvl + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h index 422b32a702..5361f50221 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.h @@ -16,11 +16,13 @@ #ifndef TRTLLM_MNNVL_ALLREDUCE_KERNELS_H #define TRTLLM_MNNVL_ALLREDUCE_KERNELS_H +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels::mnnvl +TRTLLM_NAMESPACE_BEGIN +namespace kernels::mnnvl { /** @@ -66,6 +68,7 @@ struct AllReduceFusionParams void oneshotAllreduceFusionOp(AllReduceFusionParams const& params); void twoshotAllreduceFusionOp(AllReduceFusionParams const& params); -} // namespace tensorrt_llm::kernels::mnnvl +} // namespace kernels::mnnvl +TRTLLM_NAMESPACE_END #endif // TRTLLM_MNNVL_ALLREDUCE_KERNELS_H diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu index 7bc9e326fb..44a32f9a1f 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.cu @@ -13,13 +13,16 @@ * See the License for the 
specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h" #include "tensorrt_llm/kernels/quantization.cuh" #include -namespace tensorrt_llm::kernels::ar_fusion::moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion::moe { template struct LamportComm @@ -770,4 +773,6 @@ void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& par #undef MOE_FINALIZE_DISPATCH1 } -}; // namespace tensorrt_llm::kernels::ar_fusion::moe +}; // namespace kernels::ar_fusion::moe + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h index 4a35d14bf0..556dd4e5cd 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAllReduceFusionKernels.h @@ -15,16 +15,19 @@ */ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/runtime/ipcUtils.h" -namespace tensorrt_llm::kernels::ar_fusion::moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ar_fusion::moe { static constexpr int kElemsPerAccess = 8; static constexpr int kOneShotMaxToken = 128; @@ -102,4 +105,6 @@ struct MoeFinalizeAllReduceFusionParams : public AllReduceFusionParams void moefinalize_allreduce_fusion_op(MoeFinalizeAllReduceFusionParams const& params); -} // namespace tensorrt_llm::kernels::ar_fusion::moe +} // namespace kernels::ar_fusion::moe + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index 62c25ce3ca..1ee535bdbd 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/vec_dtypes.cuh" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::moe_comm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::moe_comm { #define ENABLE_DEBUG_PRINT 0 @@ -1082,4 +1085,6 @@ void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv expert_ids, recv_counters, ep_size, max_tokens_per_rank, top_k, invalid_id); } -} // namespace tensorrt_llm::kernels::moe_comm +} // namespace kernels::moe_comm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h index 93e6508253..193a3806df 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h @@ -15,11 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm::kernels::moe_comm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::moe_comm { // Configuration constants @@ -176,4 +179,6 @@ void moe_a2a_prepare_combine_launch(MoeA2ACombineParams const& params); void moe_a2a_sanitize_expert_ids_launch(int32_t* expert_ids, int32_t const* recv_counters, int32_t invalid_id, int ep_size, int max_tokens_per_rank, int top_k, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::moe_comm +} // namespace kernels::moe_comm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu index 03cf00df6d..a80edde888 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.cu @@ -15,6 +15,7 @@ */ #include "fmhaPackedMask.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -286,4 +287,5 @@ template void invokeBuildPackedMask(PackedMaskParams<__nv_bfloat16> const&, cuda //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h index 4f4c286fee..205aee942f 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -25,8 +26,8 @@ namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -78,4 +79,5 @@ template void invokeBuildPackedMask(PackedMaskParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index e92838637a..13749d03e9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -15,6 +15,7 @@ */ #include "fmhaRunner.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/mathUtils.h" #include @@ -28,8 +29,8 @@ //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -738,4 +739,5 @@ bool FusedMHARunnerV2::isFmhaSupported() } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h index afa8eb949a..ab2c82a544 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h @@ -29,11 +29,12 @@ #include "fused_multihead_attention_common.h" #include "fused_multihead_attention_v2.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tmaDescriptor.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -102,4 +103,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index c2c0c48d16..93002edeff 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ 
b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -16,16 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" +#include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" +#include "tensorrt_llm/kernels/sparseAttentionKernels.h" #include "tmaDescriptor.h" #include #include -#include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" -#include "tensorrt_llm/kernels/sparseAttentionKernels.h" +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -518,4 +518,5 @@ struct Launch_params }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp index 7af9c4192a..ad133e6603 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp @@ -15,13 +15,17 @@ */ #include "fused_multihead_attention_v2.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include +#include #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -556,7 +560,9 @@ FusedMultiHeadAttentionXMMAKernelV2 const* getXMMAKernelsV2(Data_type inputType, { sm = kSM_120; } - return FusedMHAKernelFactoryV2::Get().getXMMAKernels(sMhaKernelMetaInfosV2, - sizeof(sMhaKernelMetaInfosV2) / sizeof(sMhaKernelMetaInfosV2[0]), inputType, outputType, sm); + return FusedMHAKernelFactoryV2::Get().getXMMAKernels( + sMhaKernelMetaInfosV2, sMhaKernelMetaInfosV2Size, inputType, outputType, sm); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff 
--git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h index 3dc1a6110c..54241f67c9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h @@ -21,6 +21,7 @@ #include "cubin/fmha_cubin.h" #include "cuda_runtime_api.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tmaDescriptor.h" @@ -33,7 +34,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -153,4 +156,6 @@ using FusedMHAKernelFactoryV2 = TFusedMHAKernelFactory +#include +#include + +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -500,4 +506,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cumsumLastDim.cu b/cpp/tensorrt_llm/kernels/cumsumLastDim.cu index 8989e95fcf..100635c68f 100644 --- a/cpp/tensorrt_llm/kernels/cumsumLastDim.cu +++ b/cpp/tensorrt_llm/kernels/cumsumLastDim.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include "cumsumLastDim.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -170,4 +171,5 @@ INSTANTIATE_CUMSUM_LastDim_DATA_TYPE(__nv_bfloat16); #undef INSTANTIATE_CUMSUM_LastDim_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cumsumLastDim.h b/cpp/tensorrt_llm/kernels/cumsumLastDim.h index 2266f685eb..7045ec3c19 100644 --- a/cpp/tensorrt_llm/kernels/cumsumLastDim.h +++ b/cpp/tensorrt_llm/kernels/cumsumLastDim.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -34,4 +35,5 @@ void invokeCumsumLastDim(SizeType32 batchSize, SizeType32 inputLength, void cons void* __restrict__ output, void* workspace, size_t tempStorageBytes, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu index 39911eac61..d5633b2cce 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.cu @@ -15,6 +15,7 @@ */ #include "customAllReduceKernels.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" @@ -26,7 +27,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { using tensorrt_llm::common::divUp; @@ -2014,4 +2017,6 @@ void lamportInitialize(void* buffer, size_t size, nvinfer1::DataType dataType, c sync_check_cuda_error(stream); } -} // namespace 
tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h index c96a1b3064..06b5a281fb 100644 --- a/cpp/tensorrt_llm/kernels/customAllReduceKernels.h +++ b/cpp/tensorrt_llm/kernels/customAllReduceKernels.h @@ -16,15 +16,18 @@ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { constexpr size_t WARP_SIZE = 32; @@ -192,4 +195,6 @@ namespace reduce_fusion bool is_lamport_supported(nvinfer1::DataType dataType, int token_num, int hidden_size); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu index 59f3a67f13..a767cfccda 100644 --- a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu +++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu @@ -15,6 +15,7 @@ */ #include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/archCondition.h" @@ -29,7 +30,9 @@ namespace cg = cooperative_groups; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static constexpr int BLOCK_SIZE = 1024; @@ -284,4 +287,6 @@ INSTANTIATE_RENORM_MOE_ROUTING(half, __nv_bfloat16, int32_t, true); INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, int32_t, true); #endif -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h index 
500889c0e5..f8240b4363 100644 --- a/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h +++ b/cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h @@ -16,14 +16,19 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template void invokeCustomMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens, int64_t const numExperts, int64_t const topK, cudaStream_t const stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu index 3fa5fae3af..27958a8671 100644 --- a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu +++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/cuteDslKernels/moeUtils.h" @@ -25,7 +26,9 @@ #include #include -namespace tensorrt_llm::kernels::cute_dsl +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cute_dsl { namespace { @@ -557,4 +560,6 @@ INSTANTIATE_MOE_ACTIVATION(__nv_bfloat16, __nv_fp4_e2m1, uint8_t); #endif #undef INSTANTIATE_MOE_ACTIVATION -} // namespace tensorrt_llm::kernels::cute_dsl +} // namespace kernels::cute_dsl + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h index 2bd356e3b0..fb84769fd9 100644 --- a/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h +++ b/cpp/tensorrt_llm/kernels/cuteDslKernels/moeUtils.h @@ -15,11 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h" #include #include -namespace 
tensorrt_llm::kernels::cute_dsl +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cute_dsl { template void moePermute(InputType const* input, InputType* permuted_output, SFType const* input_sf, SFType* permuted_sf, @@ -44,4 +47,6 @@ void moeActivation(InputType const* input, OutputType* output, float const* glob cutlass_kernels::ActivationParams activation_params, int32_t const max_num_permuted_tokens, int32_t const interm_size, int32_t const tile_size, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::cute_dsl +} // namespace kernels::cute_dsl + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h index a4be82607a..8ea96d0b6a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h @@ -29,12 +29,15 @@ #include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/kernel/tile_scheduler.hpp" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/ipcNvlsMemory.h" using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { ////////////////////////////////////////////// // Sm100 Two-shot fusion @@ -374,4 +377,6 @@ private: cutlass::KernelHardwareInfo _hw_info; }; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h index fb446b451d..97bfea0f79 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h @@ -37,12 +37,15 @@ #include "./epilogue/sm90_visitor_allreduce_tma_warpspecialized.hpp" #include "./kernel/sm90_gemm_allreduce_tma_warpspecialized_pingpong.hpp" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/ipcNvlsMemory.h" using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { ////////////////////////////////////////////// // Sm90 Two-shot fusion @@ -322,4 +325,6 @@ private: cutlass::KernelHardwareInfo _hw_info; }; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu index 33f6c61882..2bca57c229 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_runner.cu @@ -15,13 +15,17 @@ */ #include "./allreduce_gemm_impl_sm100.h" #include "./allreduce_gemm_impl_sm90.h" + +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "cutlass/bfloat16.h" #include "cutlass/float8.h" #include "cutlass/half.h" -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { ///////////////////////////////////////////////// // GemmAllReduce implementation specializations @@ -292,4 +296,6 @@ template class GemmAllReduceImplRunner>; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 1283d8936e..028effc68f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #ifdef __GNUC__ // Check if the compiler is GCC or Clang @@ -36,8 +37,8 @@ using namespace tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -693,4 +694,5 @@ CutlassGemmConfig estimate_best_config_from_occupancies(std::vector( } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h index b12fd73724..f18b630767 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -73,4 +74,5 @@ void symmetric_quantize(int8_t* processed_quantized_weight, int8_t* unprocessed_ } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h index 411013aa26..dbbed4e08c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h @@ -16,6 +16,7 @@ #pragma once +#include 
"tensorrt_llm/common/config.h" #include #include "cutlass/half.h" @@ -30,8 +31,8 @@ #include "cutlass/float_subbyte.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -163,4 +164,5 @@ struct CutlassToTllmTypeAdapter } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu index cbf33a9ce5..f4f4e40c01 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_bf16.cu @@ -15,9 +15,10 @@ */ #include "fp4_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -84,4 +85,5 @@ template class CutlassFp4GemmRunner<__nv_bfloat16, FP4GemmType::W4A8_MXFP4_MXFP8 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu index 0b232fb95b..71453157a5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp16.cu @@ -15,9 +15,10 @@ */ #include "fp4_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -81,4 +82,5 @@ template class CutlassFp4GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu index d733c97f6b..e187080938 100644 --- 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_fp32.cu @@ -15,9 +15,10 @@ */ #include "fp4_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -81,4 +82,5 @@ template class CutlassFp4GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h index 25cd88b478..003dcb9bb3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/fp4_gemm_template.h @@ -39,11 +39,13 @@ #include "mxfp8_mxfp4_gemm_template_sm100.h" #include "nvfp4_nvfp4_gemm_template_sm100.h" #include "nvfp4_nvfp4_gemm_template_sm120.h" + +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -527,4 +529,5 @@ size_t CutlassFp4GemmRunner::getWorkspaceSize( } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h index 4191b337fe..3970563bc1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/mxfp8_mxfp4_gemm_template_sm100.h @@ -29,6 +29,7 @@ #include "cutlass/gemm/collective/collective_builder.hpp" #include "cutlass/gemm/gemm.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include 
"tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/archCondition.h" @@ -41,8 +42,8 @@ using namespace cute; using namespace tensorrt_llm::kernels::cutlass_kernels; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -291,4 +292,5 @@ size_t genericMXFP8xMXFP4GemmKernelLauncher(void* D, void const* A, void const* } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h index 720e62064d..277a16aa1b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm100.h @@ -29,17 +29,17 @@ #include "cutlass/gemm/collective/collective_builder.hpp" #include "cutlass/gemm/gemm.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/archCondition.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" -#include "tensorrt_llm/common/envUtils.h" - #ifndef _WIN32 #pragma GCC diagnostic pop #endif // #ifndef _WIN32 -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -329,4 +329,5 @@ size_t genericFp4GemmKernelLauncher(void* D, void const* A, void const* B, void } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h index d9eeda8476..eaa3378acb 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp4_gemm/nvfp4_nvfp4_gemm_template_sm120.h @@ -30,17 +30,17 @@ #include "cutlass/gemm/gemm.h" #include "cutlass/util/packed_stride.hpp" -#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" - +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" #ifndef _WIN32 #pragma GCC diagnostic pop #endif // #ifndef _WIN32 -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -259,4 +259,5 @@ size_t genericFp4GemmKernelLauncherSm120(void* D, void const* A, void const* B, } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu index d234ef8b75..e8552e21f0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.cu @@ -16,9 +16,12 @@ #include "fp8_blockscale_gemm.h" #include "fp8_blockscale_gemm_kernel.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { template @@ -310,4 +313,6 @@ template class CutlassFp8BlockScaleGemmRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv template class CutlassFp8BlockScaleGemmRunner<__nv_fp8_e4m3, __nv_bfloat16, __nv_bfloat16>; template class CutlassFp8BlockScaleGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>; -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h index 29a954ac11..b178c1a1b8 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h @@ -15,13 +15,18 @@ */ #pragma once + +#include "tensorrt_llm/common/config.h" + #include #include #include #include // non-persistent-cooperative GEMM -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { class CutlassFp8BlockScaleGemmRunnerInterface @@ -146,4 +151,6 @@ private: int64_t expected_m_ = 0; }; -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh index 7f95456fb0..e50f2915f2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh @@ -31,10 +31,13 @@ #include "ada_blockwise_gemm/sm89_fp8_gemm_1d1d.cuh" #include "fp8_blockscale_mma_utils.cuh" #include "fp8_blockscale_tma_utils.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/deep_gemm/fp8_gemm.cuh" +TRTLLM_NAMESPACE_BEGIN + namespace kernel_utils { @@ -154,7 +157,7 @@ __inline__ __device__ uint32_t elect_one_sync([[maybe_unused]] int lane_id) } // namespace kernel_utils -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +namespace kernels::fp8_blockscale_gemm { template @@ -1960,4 +1963,6 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* 
mat_a, __nv_fp8_e4m3* fp8_ma } } -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh index 3282f2750c..9b7e9ceb4f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_mma_utils.cuh @@ -15,10 +15,15 @@ */ #pragma once + +#include "tensorrt_llm/common/config.h" + #include #include -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { struct SM90_64x16x32_F32E4M3E4M3_SS @@ -610,4 +615,6 @@ struct Fp8MmaSelector using Type = decltype(select_type()); }; -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh index 06cff88ad6..a256c09b4a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh @@ -15,6 +15,9 @@ */ #pragma once + +#include "tensorrt_llm/common/config.h" + #include #include #include @@ -24,7 +27,9 @@ #include #include -namespace tensorrt_llm::kernels::fp8_blockscale_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::fp8_blockscale_gemm { template @@ -138,4 +143,6 @@ __device__ uint64_t mbarrier_arrive_1_expect_tx_cta(void* smem_ptr, uint32_t tx_ return state; } -} // namespace tensorrt_llm::kernels::fp8_blockscale_gemm +} // namespace kernels::fp8_blockscale_gemm + +TRTLLM_NAMESPACE_END diff 
--git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h index 7d0816e2eb..3ffe0d317a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm.h @@ -17,6 +17,7 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include @@ -25,8 +26,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -85,4 +86,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu index a1fcb7a5f6..25064c93c5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_bf16.cu @@ -15,9 +15,10 @@ */ #include "fp8_rowwise_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFp8RowwiseGemmRunner<__nv_bfloat16>; #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu index 83582db603..6f9623c39d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_fp16.cu @@ -15,9 +15,10 @@ */ #include "fp8_rowwise_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFp8RowwiseGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h index f41637d4ed..68a4066a4f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm100.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -43,7 +44,9 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { using namespace cute; @@ -177,4 +180,6 @@ struct DeviceGemmFp8RowwiseSm100 using Gemm = typename cutlass::gemm::device::GemmUniversalAdapter; }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h index ea94e6a9b2..468a528cff 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm89.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" // clang-format off #include "cutlass/cutlass.h" @@ -35,8 +36,8 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -132,4 +133,5 @@ struct DeviceGemmFp8RowwiseSm89 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h index 7852e36f3f..4939879761 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_kernel_template_sm90.h @@ -26,6 +26,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -49,8 +50,8 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -196,4 +197,5 @@ struct DeviceGemmFp8RowwiseSm90 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h index 3c095421ba..0d601060ee 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_rowwise_gemm/fp8_rowwise_gemm_template.h @@ -26,6 +26,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -49,8 +50,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -865,4 +866,5 @@ size_t CutlassFp8RowwiseGemmRunner::getWorkspaceSize(int const m, int const n } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu index e4783fdefd..d3e1a79b35 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu index 8934a2c0df..c3cbcf6ab6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu index b3fa996a87..12c95f73ee 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int4_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu index 064e4dbde9..dbcc199193 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -28,4 +29,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu index 0dbdfabe0a..e87751fbad 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, cutlass::WeightO #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu index 6701d0637e..5d8b9a37c7 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/bf16_int8_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, cutlass::WeightO #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu index ce57833187..dced9c13ba 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_bf16_out_bf16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu index 7cef1a1272..9de8362de0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scalebias_f16_out_f16.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu index 66644fcfde..4ce228abc0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_bf16_out_bf16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation Type* #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu index 392e2e763b..74341a215d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_fg_scaleonly_f16_out_f16.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation Type* #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu index e40dd578cf..59d3be75ca 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/e4m3_int4_gemm_per_col_f16_out_f16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -32,4 +33,5 @@ template class CutlassFpAIntBGemmRunner<__nv_fp8_e4m3, /*Activation Type*/ #endif } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu index 45e0f4c0f8..74fe659257 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -26,4 +27,5 @@ template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu index 113c6c6174..de1189ce34 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu index 6e69985edc..bb41afea9e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int4_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu index 51e33974f7..b643e8a043 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scalebias.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu index 148cfb519e..3f6cd93988 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_fg_scaleonly.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu index 35d199f58f..ccc45aa8c1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fp16_int8_gemm_per_col.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFpAIntBGemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h index de0c9c61bb..3b30dc77d2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -19,13 +19,14 @@ #include "../include/common.h" #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include #include namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -133,4 +134,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index 360da97532..1ebaecaa11 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -22,6 +22,7 @@ #include "cutlass/gemm/kernel/default_gemm.h" #include "cutlass_extensions/compute_occupancy.h" #include "cutlass_extensions/gemm/device/gemm_universal_base_compat.h" +#include "tensorrt_llm/common/config.h" #include "cutlass_extensions/epilogue_helpers.h" #include "cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" @@ -44,8 
+45,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -584,4 +585,5 @@ CutlassFpAIntBGemmRunner -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -36,4 +37,5 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl index 94bf6c9648..06f89bf5fd 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl @@ -41,14 +41,15 @@ #endif // __GNUC__ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -298,4 +299,5 @@ void sm90_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h index 6e670d2d33..42b2dcae58 100644 --- 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm.h @@ -17,6 +17,7 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include @@ -25,8 +26,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -85,4 +86,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h index 07a8b45096..743cb11b2a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_kernel_template_sm90.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -42,8 +43,8 @@ #pragma GCC diagnostic pop #endif // __GNUC__ -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -118,4 +119,5 @@ struct DeviceGemmGatedSm90 } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h index ce175160a9..d5d8c43233 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/fused_gated_gemm_template.h @@ -20,6 +20,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wstrict-aliasing" #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "cute/tensor.hpp" #include "cutlass/conv/convolution.h" @@ -41,8 +42,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -446,4 +447,5 @@ size_t CutlassFusedGatedGemmRunner::getWorkspaceSize(int const m, int const n } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu index 2e603cfb15..6a75517567 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fused_gated_gemm/gemm_swiglu_e4m3.cu @@ -15,9 +15,10 @@ */ #include "fused_gated_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -25,4 +26,5 @@ namespace cutlass_kernels template class CutlassFusedGatedGemmRunner<__nv_fp8_e4m3>; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h index 93068447eb..d7c8234839 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/allreduce_gemm_runner.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -25,7 +26,9 @@ #include "cutlass_extensions/gemm_configs.h" #include 
"tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" -namespace tensorrt_llm::kernels::opened_cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::opened_cutlass_kernels { using namespace cute; using namespace tensorrt_llm::cutlass_extensions; @@ -248,4 +251,6 @@ private: std::map mGemmRegistry; }; -} // namespace tensorrt_llm::kernels::opened_cutlass_kernels +} // namespace kernels::opened_cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h index d6e5c38c10..8a9937c620 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/common.h @@ -16,7 +16,11 @@ #pragma once -namespace tensorrt_llm::kernels::cutlass_kernels +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { // IMPORTANT: Keep the same order of activation functions in this enum and the activation functions in @@ -34,4 +38,6 @@ enum class ActivationType Relu2 = 8, }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h index 94318f2e62..944dbc0227 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/fp4_gemm.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -25,8 +26,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -97,4 +98,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h index b3e3aafef9..57d59a52a0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/low_latency_gemm.h @@ -17,17 +17,14 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include -// namespace tk = tensorrt_llm::common; +TRTLLM_NAMESPACE_BEGIN -namespace tkc = tensorrt_llm::cutlass_extensions; - -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -126,4 +123,4 @@ private: }; // namespace cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h index aef897c2e9..a2b7c112bd 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -35,7 +36,9 @@ #include #endif -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template @@ -336,4 +339,6 @@ private: size_t calcMaxWorkspaceSize(int num_experts) const; }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index 1f01636217..c4f3fe61f3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -25,6 +25,7 @@ #ifdef ENABLE_FP4 #include #endif 
+#include "tensorrt_llm/common/config.h" #include #include #include @@ -33,7 +34,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { // Change to following declarations must sync with lora.h in public repo class LoraImpl; @@ -1016,4 +1019,6 @@ private: void populateRandomBuffer(void* buffer_void, size_t size, cudaStream_t stream); } // namespace cutlass_kernels -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h index a169bccf20..e902e2c9d6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_util_kernels.h @@ -18,6 +18,7 @@ #include "./moe_gemm_kernels.h" #include "cutlass/gemm/gemm.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" @@ -32,7 +33,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace cutlass_kernels @@ -71,4 +74,6 @@ void finalizeMoeRoutingKernelLauncher(GemmOutputType const* expanded_permuted_ro cudaStream_t stream); } // namespace cutlass_kernels -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h index 722f817dbb..2de80db507 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h @@ -17,15 +17,17 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/quantization.h" + #include #include namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -91,4 +93,5 @@ private: } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu index a3633bc099..99c940751e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_bf16.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -29,4 +30,5 @@ template class CutlassInt8GemmRunner<__nv_bfloat16>; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu index 7189956d5d..a1ec5d8d09 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp16.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassInt8GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu index 861a2d4ff0..5f0c38eeb5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_fp32.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassInt8GemmRunner; // for compilation only } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu index 6814b00e02..f8511d7d0b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_int32.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassInt8GemmRunner; } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h index 1f5fedc6fa..b542b0ab32 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm_template.h @@ -40,6 +40,7 @@ #pragma GCC diagnostic pop #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h" @@ -51,8 +52,8 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels @@ -383,4 +384,5 @@ size_t CutlassInt8GemmRunner::getWorkspaceSize(int const m, int const n, int } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h index 2395650223..6b14af0fd1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/fp8_low_latency_gemm_template.h @@ -51,6 +51,7 @@ #pragma GCC diagnostic pop #endif // __GNUC__ +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -64,8 +65,7 @@ namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace kernels { @@ -554,4 +554,4 @@ std::vector CutlassLowLatencyFp8GemmRunner::getConfigs() const }; // namespace cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu index b58d5a1731..edd990c94c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_bf16.cu @@ -15,9 +15,10 @@ */ #include "fp8_low_latency_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassLowLatencyFp8GemmRunner<__nv_bfloat16>; // for compilation } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu index 2a9e07721f..98017f5930 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp16.cu @@ -15,9 +15,10 @@ */ #include "fp8_low_latency_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class 
CutlassLowLatencyFp8GemmRunner; // for compilation only } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu index a29b4e9bad..66dfb2596b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/low_latency_gemm/low_latency_fp8_gemm_fp32.cu @@ -15,9 +15,10 @@ */ #include "fp8_low_latency_gemm_template.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace cutlass_kernels @@ -27,4 +28,5 @@ template class CutlassLowLatencyFp8GemmRunner; // for compilation only } // namespace cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h index efc7d359f8..49cd2ea262 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.h @@ -14,7 +14,11 @@ * limitations under the License. 
*/ -namespace tensorrt_llm::kernels::cutlass_kernels_oss +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { template @@ -22,4 +26,6 @@ void sm80_generic_fused_moe_gemm_kernelLauncher(ElementType_ const* A, CutlassWe ElementType_ const* biases, bool bias_is_broadcast, ElementType_* C, int64_t const* total_tokens_including_expert, int64_t num_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, int multi_processor_count, cudaStream_t stream, int* kernel_occupancy); -} +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl index 85c2f00a54..2d112fb44c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/fused_moe_gemm_launcher_sm80.inl @@ -25,9 +25,12 @@ #include "cutlass_extensions/epilogue_helpers.h" #include "cutlass_extensions/gemm/kernel/fused_moe_kernel.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { template @@ -93,4 +96,6 @@ void sm80_generic_fused_moe_gemm_kernelLauncher(ElementType_ const* A, CutlassWe auto result = cudaGetLastError(); TLLM_CHECK_WITH_INFO(result == cudaSuccess, "Fail to execute fused moe kernel, cuda error %d\n", (int) (result)); } -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h index 87fa89373e..77b809d0f0 100644 --- 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.h @@ -17,9 +17,12 @@ #pragma once #include "../../include/moe_gemm_kernels.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; // Keep in sync with the signature generated by generate_kernels.py @@ -31,4 +34,6 @@ void tma_warp_specialized_generic_moe_gemm_kernelLauncher(TmaWarpSpecializedGrou cute::Shape dynamic_cluster_shape, cute::Shape fallback_cluster_shape); -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl index e8f61e300a..56552a484b 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl @@ -36,6 +36,7 @@ #include "cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -55,8 +56,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -709,4 +710,5 @@ using namespace cutlass::epilogue; } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h index 2b6b3a81cd..f2d6bcfa3e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h @@ -17,10 +17,11 @@ #include "../../include/moe_gemm_kernels.h" #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -36,4 +37,5 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher( } // namespace cutlass_kernels_oss } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl index 528c3584a6..86e61c56b2 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl @@ -54,6 +54,7 @@ #endif // __GNUC__ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -61,8 +62,8 @@ #include "moe_gemm_tma_ws_mixed_input_launcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cutlass_kernels_oss @@ -246,4 +247,5 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} 
// namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu index be29019bc6..5e090906c0 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp4.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, __nv_fp4_e2m1, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu index 69ea5c6326..40d5b3e68c 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_fp8.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, __nv_fp8_e4m3, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu index cbb8dba108..50480e1f2e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu 
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint4.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, cutlass::uint4b_t, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu index e642d785dc..e129d569fe 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_bf16_uint8.cu @@ -15,10 +15,15 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_BF16 template class MoeGemmRunner<__nv_bfloat16, uint8_t, __nv_bfloat16>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu index a47b9f18a9..4e4f87d344 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp16.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + 
+TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu index f1a885ea77..9afe0dda88 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_fp4.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu index 234fcc81ae..f8de82e5b1 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint4.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu index 5448f53271..e8cd6f186e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp16_uint8.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff 
--git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu index 3f858564cf..01d8c736a7 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp32_fp32.cu @@ -15,8 +15,13 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { template class MoeGemmRunner; } + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu index 5c6222f3b4..449e9eec0e 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp4_fp4.cu @@ -15,8 +15,11 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP4 template class MoeGemmRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>; @@ -24,4 +27,6 @@ template class MoeGemmRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, half>; template class MoeGemmRunner<__nv_fp4_e2m1, __nv_fp4_e2m1, __nv_bfloat16>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu index 1238517077..0ebaacdba3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp4.cu @@ 
-15,8 +15,11 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP4 template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half>; @@ -24,4 +27,6 @@ template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, half>; template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp4_e2m1, __nv_bfloat16>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu index 9d86df55fc..2ab4ac4f89 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_fp8.cu @@ -15,8 +15,11 @@ */ #include "moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP8 template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, half>; @@ -25,4 +28,6 @@ template class MoeGemmRunner<__nv_fp8_e4m3, __nv_fp8_e4m3, __nv_bfloat16>; #endif // template class MoeGemmRunner<__nv_fp8_e5m2, __nv_fp8_e5m2>; #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu index 812f909493..f749ca9263 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_kernels_fp8_uint4.cu @@ -15,8 +15,11 @@ */ #include 
"moe_gemm_template_dispatch.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { #ifdef ENABLE_FP8 template class MoeGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half>; @@ -24,4 +27,6 @@ template class MoeGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, half>; template class MoeGemmRunner<__nv_fp8_e4m3, cutlass::uint4b_t, __nv_bfloat16>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h index 95b55e3d84..33ece54627 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h @@ -53,6 +53,7 @@ #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -73,7 +74,9 @@ #include #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { // ============================= Variable batched Gemm things =========================== @@ -473,9 +476,9 @@ void dispatchMoeGemmToCutlass(GroupedGemmInput @@ -967,4 +970,6 @@ void MoeGemmRunner::moeGemm( runGemm(inputs, hopper_inputs); } -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h index 65fff6a285..339f95a96d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h @@ -51,6 +51,7 @@ #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" @@ -65,7 +66,9 @@ #include #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; using EpilogueFusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion; @@ -382,7 +385,7 @@ void dispatchMoeGemmSelectClusterShapeTmaWarpSpecialized(TmaWarpSpecializedGroup #undef SHAPE_CASE default: TLLM_THROW("Unsupported cluster shape config %d for MoE gemm.", (int) gemm_config.cluster_shape); } -} // namespace tensorrt_llm +} template void dispatchMoeGemmSelectTileShapeTmaWarpSpecialized(TmaWarpSpecializedGroupedGemmInput hopper_input, int num_experts, @@ -511,4 +514,6 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecialized(int num_experts, cutlass_extension return count; } -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h index 1ee7232c9e..c4265766b4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h @@ -49,6 +49,7 @@ #include "../include/moe_gemm_kernels.h" #include "launchers/moe_gemm_tma_ws_mixed_input_launcher.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" @@ -57,7 +58,9 @@ #include #include -namespace tensorrt_llm::kernels::cutlass_kernels_oss +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput; @@ -244,4 +247,6 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_ return count; } -} // namespace tensorrt_llm::kernels::cutlass_kernels_oss +} // namespace kernels::cutlass_kernels_oss + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu index 59cf79f136..fd3ef0aac6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_tma_warp_specialized_input.cu @@ -15,6 +15,7 @@ */ #include "../include/moe_gemm_kernels.h" +#include "tensorrt_llm/common/config.h" #include "cutlass/cutlass.h" @@ -25,7 +26,9 @@ #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { std::array TmaWarpSpecializedGroupedGemmInput::workspaceBuffers( int num_experts, FpXBlockScalingType scaling_type) @@ -166,4 +169,6 @@ std::string TmaWarpSpecializedGroupedGemmInput::toString() const return ss.str(); } -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu index 76c7c58586..32332ec325 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu @@ -14,6 +14,7 @@ * limitations under the 
License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/workspace.h" #include @@ -71,7 +72,9 @@ using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { /** * Takes the input maps and prepares the expanded maps for min latency @@ -4747,4 +4750,6 @@ template class CutlassMoeFCRunner<__nv_bfloat16, __nv_fp4_e2m1>; #endif #endif -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh index 0a752f7b1f..36e271228d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cuh @@ -15,11 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "cutlass/epilogue/thread/activation.h" -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { // ============================== Activation Adaptors ================================= @@ -72,4 +75,6 @@ struct SwigluBiasAdaptor } }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h index a662030ac2..a96a43a964 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_tma_warp_specialized_traits.h @@ -19,12 +19,15 @@ #include "../include/moe_gemm_kernels.h" #include "cutlass/arch/mma_sm90.h" #include 
"cutlass_extensions/epilogue_helpers.h" +#include "tensorrt_llm/common/config.h" #ifdef ENABLE_FP4 #include #endif -namespace tensorrt_llm::kernels::cutlass_kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels { // Blackwell arch @@ -103,4 +106,6 @@ constexpr bool isValidAmpereMOESpecialisation() #endif } -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py index 85012c79ba..61070281c4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py @@ -308,8 +308,8 @@ def get_file_content(launcher_inl_files, operations): instantiations = "\n".join(insts_list) file_content = f"""{includes} -namespace tensorrt_llm -{{ +#include "tensorrt_llm/common/config.h" +TRTLLM_NAMESPACE_BEGIN namespace kernels {{ namespace cutlass_kernels_oss @@ -319,7 +319,7 @@ namespace cutlass_kernels_oss }} // namespace cutlass_kernels_oss }} // namespace kernels -}} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END """ return file_content diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu index 7791499fd1..b2b6149d29 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace mmha @@ -176,4 +177,5 @@ INSTANTIATE_MMHA_NORMAL_AND_PAGED(__nv_bfloat16, false) //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h index 3f2705f2ee..9ef6593d16 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/kernels/gptKernels.h" @@ -26,8 +27,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -294,4 +295,5 @@ inline int estimate_min_multi_block_count(int max_timesteps, int max_dynamic_shm } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp new file mode 100644 index 0000000000..5cf342347f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e10afcbcfe15eb73c30612fa13d6a75d45e4a7fe2c5c4ec32ca4643a1508f214 +size 273632 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h index d39f5adc5d..875aaee182 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h @@ -14,1264 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -namespace tensorrt_llm -{ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + namespace kernels { -// clang-format off -// SingleQueryToken kernels. -#ifndef EXCLUDE_SM_80 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; 
-extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len; -#endif - -#ifndef EXCLUDE_SM_86 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern 
unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long 
long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len; -#endif - -#ifndef EXCLUDE_SM_89 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; 
-extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len; -#endif - -#ifndef EXCLUDE_SM_90 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin[]; - -// MultiQueryToken kernels. -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern 
unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern 
unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin[]; - -// MHA with beamWidth=4 -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin[]; - -// SingleQueryToken kernels. 
-extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern 
uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern 
uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len; - -// MultiQueryToken kernels. -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len; 
-extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len; - -// MHA with beamWidth=4 -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len; -#endif - -#ifndef EXCLUDE_SM_120 -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long 
long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin[]; -extern unsigned long long xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin[]; - -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len; -extern uint32_t xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len; - - -#endif - -static const struct XQAKernelMetaInfo +struct XQAKernelMetaInfo { Data_type mDataType; Data_type mKVDataType; @@ -1285,634 +36,13 @@ static const struct XQAKernelMetaInfo unsigned int mSM; const unsigned long long* mCubin; unsigned int mCubinSize; 
- const char* mFuncName; -} sXqaKernelMetaInfo[] = { -// SingleQueryToken kernels. -#ifndef EXCLUDE_SM_80 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 
0, 16, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_80, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_80, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_80, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_80, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_80, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_80, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_80, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_80_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_80, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_80_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_86 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, 
true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_86, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_86, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_86, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_86, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_86, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_86, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_86_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_86, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_86_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_89 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_89, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_89, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, 
kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_89, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_89_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_89, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_89_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_90 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, 
"kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, 
kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 16, 16, 128, true, false, kSM_90, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 64, true, false, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 0, false, false, kSM_90, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 16, 16, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 0, false, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 16, 16, 128, true, false, kSM_90, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_16_m_16_sm_90_cubin_len, "kernel_mha"}, -// MultiQueryToken kernels. -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 128, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 0, false, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 0, false, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 128, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 64, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 64, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_64_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 0, 32, 32, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_fp16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_90, 
xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_fp16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_bf16_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 64, 1, 0, 32, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_int8_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 16, 32, true, true, kSM_90, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_16_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 64, 1, 0, 32, 32, true, true, kSM_90, 
xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin, xqa_kernel_dt_bf16_d_64_beam_1_kvt_e4m3_pagedKV_32_nqpkv_0_m_32_sm_90_cubin_len, "kernel_mha"}, -// MHA with beamWidth=4 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 1, 128, true, 
false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 64, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 1, 128, true, false, kSM_90, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_1_sm_90_cubin_len, "kernel_mha"}, -#endif -#ifndef EXCLUDE_SM_120 -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 64, 
true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 16, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 1, 8, 8, 128, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_fp16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 32, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 0, false, false, kSM_120, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 64, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 128, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_128_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_bf16_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 16, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 1, 8, 8, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_int8_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_16_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_32_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_64_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 1, 8, 8, 128, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_1_kvt_e4m3_pagedKV_128_nqpkv_8_m_8_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_fp16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 32, true, false, kSM_120, 
xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_INT8, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_FP16, DATA_TYPE_E4M3, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_fp16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 0, false, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_bf16_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 64, true, false, kSM_120, 
xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_INT8, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_int8_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 0, false, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 16, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_16_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 32, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_32_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 64, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_64_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"}, -{ DATA_TYPE_BF16, DATA_TYPE_E4M3, 256, 4, 1, 4, 128, true, false, kSM_120, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin, xqa_kernel_dt_bf16_d_256_beam_4_kvt_e4m3_pagedKV_128_nqpkv_1_m_4_sm_120_cubin_len, "kernel_mha"} - -#endif + char const* mFuncName; }; +extern XQAKernelMetaInfo const sXqaKernelMetaInfo[]; +extern size_t const sXqaKernelMetaInfoSize; + // clang-format on } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h index c85f2f2c30..bf6b22385e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionLaunch.h @@ -17,6 +17,7 @@ #include "decoderMaskedMultiheadAttentionTemplate.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" #include "tensorrt_llm/kernels/gptKernels.h" @@ -32,8 +33,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -492,4 +493,5 @@ void mmha_launch_kernel(KernelParamsType const& params, KVCacheBuffer const& kv_ const KVLinearBuffer& shift_k_cache, const cudaStream_t& stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index 21b9112b9f..5bb632465d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention.h" @@ -37,8 +38,8 @@ #include #endif // ENABLE_MULTI_BLOCK_OPTION -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2753,4 +2754,5 @@ __global__ void 
__launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h index 6e8dce40ac..647e92cc76 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAConstants.h @@ -16,11 +16,12 @@ * This file contains constants that decoderXQA*.{h,cpp} need. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { inline constexpr int kMinHistoryTokensPerBlock = 128; @@ -40,4 +41,5 @@ inline constexpr int getXqaMaxNumSubSeq(bool isMLA) } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp index 20588b0afa..8ac26a0cc8 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h" @@ -22,8 +23,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -52,4 +53,5 @@ std::unique_ptr DecoderXQAImpl::create(DecoderXQARunner* runner, } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h index f43c186d8c..7d39f36da2 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h @@ -14,13 +14,14 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -84,4 +85,5 @@ enum class XQAKernelType : int32_t }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp index bcdac05b91..dffc83764e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.cpp @@ -16,8 +16,11 @@ * Common utils to be shared between Precompiled and JIT implementation. 
*/ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" +#include "tensorrt_llm/common/config.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { uint32_t getKernelMTileSize( @@ -59,4 +62,6 @@ XQAKernelRuntimeHashKey getRuntimeHashKeyFromXQAParams(XQAParams const& xqaParam isXqaJit ? std::optional(xqaParams.position_embedding_type) : std::nullopt}; } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h index f2dcb7a858..eb907edff1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h @@ -18,6 +18,7 @@ #pragma once #include "decoderXQAConstants.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -30,8 +31,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -482,4 +483,5 @@ inline int computeMultiBlockCountSpecDecGMMA( } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp index f0c71f3766..33587d7961 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.cpp @@ -18,6 +18,7 @@ #include "cubinObj.h" #include "nvrtcWrapper/include/nvrtcWrapper.h" #include "tensorrt_llm/common/assert.h" +#include 
"tensorrt_llm/common/config.h" #include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/common/tllmException.h" #include "tensorrt_llm/common/utils.h" @@ -44,8 +45,8 @@ void CHECK_TLLM_XQA_JIT_ERROR_(tllmXqaJitStatus result, char const* const func, } // anonymous namespace -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -133,4 +134,5 @@ CompileEngine::CompileEngine(int SM, XQAParams const& xqaParams) } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h index 01db871995..8995e03dd0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/compileEngine.h @@ -15,12 +15,13 @@ */ #pragma once #include "cubinObj.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -43,4 +44,5 @@ private: } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp index f5910b5817..b57eec1b14 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.cpp @@ -17,12 +17,15 @@ #include "serializationUtils.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include 
"tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include -namespace tensorrt_llm::kernels::jit +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::jit { CubinObj::CubinObj(void const* buffer_, size_t buffer_size) @@ -184,4 +187,6 @@ CubinObj::~CubinObj() } } -} // namespace tensorrt_llm::kernels::jit +} // namespace kernels::jit + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h index 4eb3ca1095..3cb176407f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObj.h @@ -14,14 +14,15 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -86,4 +87,5 @@ private: } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h index 468cd77bc1..2eb9ef89db 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h @@ -18,13 +18,16 @@ #include "compileEngine.h" #include "serializationUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include #include #include -namespace 
tensorrt_llm::kernels::jit +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::jit { // A thread-safe collection of CubinObjs, with caching functionality. @@ -173,4 +176,6 @@ using CubinObjKey = XQAKernelFullHashKey; using CubinObjHasher = XQAKernelFullHasher; using CubinObjRegistry = CubinObjRegistryTemplate; -} // namespace tensorrt_llm::kernels::jit +} // namespace kernels::jit + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp index 03295d6d16..90dda051a0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.cpp @@ -17,6 +17,7 @@ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h" #include "compileEngine.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/utils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/cubin/xqa_kernel_cubin.h" @@ -43,7 +44,9 @@ XQAKernelRuntimeHashKey getRuntimeHashKeyFromKernelMeta(XQAKernelMetaInfo const& } // anonymous namespace -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { DecoderXQAImplJIT::DecoderXQAImplJIT(DecoderXQARunner* runner) @@ -545,4 +548,6 @@ void DecoderXQAImplJIT::runImpl(XQAParams const& xqaParams, KVCacheBuffer const& } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h index b051d7bd35..902ec0b809 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/decoderXQAImplJIT.h @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" #include "compileEngine.h" @@ -23,8 +24,8 @@ #include "tensorrt_llm/plugins/common/plugin.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -75,4 +76,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp index c19b482b30..26fadd21cc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.cpp @@ -14,12 +14,13 @@ * limitations under the License. 
*/ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/utils.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -205,4 +206,5 @@ bool supportConfigMLA(XQAParams const& xqaParams, int SM, bool forConfigurePlugi } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h index c67e54459c..8d3b43b44f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/kernelUtils.h @@ -14,11 +14,12 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" #include "tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace jit @@ -32,4 +33,5 @@ bool supportConfigTllmGen( } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h index f48af0f7c8..456680907d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/serializationUtils.h @@ -14,13 +14,14 @@ * limitations under the License. 
*/ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { namespace jit @@ -49,4 +50,5 @@ void writeToBuffer(T output, uint8_t*& buffer, size_t& remaining_buffer_size) } // namespace jit } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp index 2cf90486d3..7bd7c32e5e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.cpp @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/workspace.h" @@ -33,7 +34,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class XQAKernelList @@ -44,7 +47,7 @@ public: XQAKernelList(Data_type type, unsigned int sm) : mDriver(tensorrt_llm::common::CUDADriverWrapper::getInstance()) , mDataType(type) - , mKernelMetaCount(sizeof(sXqaKernelMetaInfo) / sizeof(sXqaKernelMetaInfo[0])) + , mKernelMetaCount(sXqaKernelMetaInfoSize) , mKernelMeta(&sXqaKernelMetaInfo[0]) , mSM(sm) { @@ -557,4 +560,6 @@ void DecoderXQAImplPrecompiled::runWithKVBlockArray( runDispatchBuffer(xqa_params, kv_block_array, stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h index e41d637597..7f48b47468 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplPrecompiled.h @@ -14,10 +14,11 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImpl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -47,4 +48,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp index 946fea5a7e..165ffc2848 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.cpp @@ -22,6 +22,7 @@ #include #include +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/workspace.h" @@ -31,8 +32,8 @@ #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -181,4 +182,4 @@ void DecoderXQARunnerResource::serialize(void* buffer, size_t buffer_size) const } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h index 1604c697fe..b53bd4a94e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h @@ 
-20,6 +20,7 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/cubinObjRegistry.h" @@ -32,8 +33,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -157,4 +158,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu index 4ed7b39b88..1d24c2fc3e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu index bbc6e0ed17..99e185e64d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_float.cu @@ -15,9 +15,10 @@ */ #include 
"../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu index 17e3601acf..c863acba6b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention104_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu index bdce42d97c..e98633b4f5 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } 
// namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu index bcc07aa8a0..20681f3274 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu index 0b6497b092..cc870a5256 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention112_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu index 3eacc7a74f..d971b5d76e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu index 65e747caf6..e60b735945 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_block_sparse_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_BLOCK_SPARSE_ATTN(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu index a43569fa1a..64df7751fb 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(__nv_bfloat16, kSizePerHe } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu index 48f2a413f0..afa21e48ca 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_bf16_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(__nv_bfloat16, kSiz } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu index fc652b5a4f..bb7ecbafea 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu index ee15867353..0914573412 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_block_sparse_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_BLOCK_SPARSE_ATTN(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu index 74c708b767..3aa0970e0b 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu index e12f887d8f..0a4573c21a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_float_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(float, kSizePerHead } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu index 4078e2bc60..3a224a79f2 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu index 4f61bf42a2..cb0574baad 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_block_sparse_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_BLOCK_SPARSE_ATTN(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu index 867c2df240..b02a92a351 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu index 8b7d988b0c..40de9b4dd7 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention128_half_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(uint16_t, kSizePerH } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu index 72aab18ab9..8cfc95fec6 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu index d5b2ab6627..825add47ff 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu index 79d3f3920a..a07e1340ed 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention144_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu index bd65335d75..7230657f3e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu index e7f7f1bf76..09b32df680 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_float.cu @@ 
-15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu index 8928b538c5..7c13505994 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention160_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu index 0229ec07b0..d799feb598 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, 
kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu index 0fca76aa35..f79fa11615 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu index 181cf5c8f3..e49050ab7f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention192_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu index d25a1d901f..b40711f997 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu index 3eded458eb..0dc1a472a1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu index d80110c60e..2b63fb389e 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention224_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu index 33d1724961..696e2b9bab 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu index 786cbafca3..e18af09838 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_bf16_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(__nv_bfloat16, kSiz } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu index 44e030d532..deb057598f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu index 985a24c45b..7c5c498e08 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_float_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include 
"../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(float, kSizePerHead } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu index 016b10fc50..90469e87d0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu index 2a709eecd5..7d27fe99a9 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention256_half_qk_tanh_scale.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ 
-34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_ATTN_LOGIT_SOFTCAPPING_SCALE(uint16_t, kSizePerH } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu index 6afa825ae8..bb6c6ee48d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu index 1906b9816a..127477fd71 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_bf16_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(__nv_bfloat16, kSizePerHe } // namespace mmha } // namespace 
kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu index 28ca9c7e82..9404f14a29 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu index 9550440780..b9fc4249b9 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_float_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu index ba9ee36cc2..73d4bf4773 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu index 288338f946..5f289fad6f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention32_half_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu index 6cd98308a6..98f5956732 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu index 72c2ef160e..09c1d6f8f4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu index df10f905de..96c271547a 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention48_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu index 90f338470e..0eb62b8567 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu index dbbccf2d0f..a739bafe59 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_bf16_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(__nv_bfloat16, kSizePerHe } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu index 775ed1038d..bb0b54ec88 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu index 87726296e3..ae3be8f097 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_float_implicit_relative_attn.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu index 4d29cc40fa..77f0539380 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu index a247a07a3f..59caa0fae7 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention64_half_implicit_relative_attn.cu @@ 
-15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -34,4 +35,5 @@ INSTANTIATE_MMHA_LAUNCHERS_WITH_IMPLICIT_REL_ATTN_BIAS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu index 11ecb92a66..8c564959d0 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu index af9f4f4fec..76e54cf297 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ 
INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu index 3f8e9c4c23..c50b41b187 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention80_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu index 286ed2b2fb..3b6d1c6c0f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_bf16.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -38,4 +39,5 @@ INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu index ef886b9412..88217b08bc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_float.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu index af8f7fa4d2..b1a188a6ea 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/instantiation/decoderMaskedMultiheadAttention96_half.cu @@ -15,9 +15,10 @@ */ #include "../decoderMaskedMultiheadAttentionLaunch.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -36,4 +37,5 @@ INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp index 6c2180ba80..e4b642a11e 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.cpp @@ -14,12 +14,15 @@ * limitations under the License. */ #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace @@ -183,4 +186,6 @@ CUtensorMap makeTensorMapForXqaMlaQ( return makeTensorMapForQ(driver, q, CU_TENSOR_MAP_DATA_TYPE_UINT8, xqaParams.head_size, xqaParams.num_q_heads * xqaParams.total_num_input_tokens, partElems, xqaParams.num_q_heads); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h index da4240d277..03b2373bcd 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/tensorMapUtils.h @@ -14,11 +14,12 @@ * limitations under the License. 
*/ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ CUtensorMap makeTensorMapForXqaMlaQ( std::shared_ptr const& driver, XQAParams const& xqaParams, void const* q); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index 35115b8cb6..6ac232e499 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -14,13 +14,14 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" #include "tensorrt_llm/kernels/sparseAttentionKernels.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -206,4 +207,5 @@ struct XQAParams }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h index 09bd551c0b..aa7e31dbd1 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttentionUtils.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/kernels/gptKernels.h" #include @@ -31,8 +32,8 @@ using tensorrt_llm::common::float22bf162; using tensorrt_llm::common::hsub2; #endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -4256,4 +4257,5 @@ 
__device__ __host__ constexpr inline T const& const_max(T const& a, T const& b) } // namespace mmha } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decodingCommon.cu b/cpp/tensorrt_llm/kernels/decodingCommon.cu index ad8249a3a6..1e091d6961 100644 --- a/cpp/tensorrt_llm/kernels/decodingCommon.cu +++ b/cpp/tensorrt_llm/kernels/decodingCommon.cu @@ -14,10 +14,11 @@ * limitations under the License. */ -#include "tensorrt_llm/kernels/decodingCommon.h" - +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" + #include "tensorrt_llm/common/reduceKernelUtils.cuh" +#include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include @@ -25,7 +26,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __global__ void curandInitialize(curandState_t* state, int const* batchSlots, int const size, uint64_t const randomSeed) @@ -235,4 +238,6 @@ template void invokeScatterDecodingParams( template void invokeScatterDecodingParams( int32_t const* src, int32_t scalar, int32_t* dst, int const* batchSlots, int batchSize, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.cu b/cpp/tensorrt_llm/kernels/decodingKernels.cu index 77bc6b71ae..98b25dde4c 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/decodingKernels.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/decodingKernels.h" @@ -30,8 +31,7 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace 
kernels { @@ -712,7 +712,9 @@ void invokeTransposeLogProbs(float* outputLogProbs, float* outputLogProbsTiled, } // namespace kernels -namespace runtime::kernels +TRTLLM_NAMESPACE_END + +namespace tensorrt_llm::runtime::kernels { // Must be similar to [cpp/tensorrt_llm/thop/gatherTreeOp.cpp] gatherTree void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, @@ -802,6 +804,4 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -} // namespace runtime::kernels - -} // namespace tensorrt_llm +} // namespace tensorrt_llm::runtime::kernels diff --git a/cpp/tensorrt_llm/kernels/decodingKernels.h b/cpp/tensorrt_llm/kernels/decodingKernels.h index cf648c7605..0e4fded936 100644 --- a/cpp/tensorrt_llm/kernels/decodingKernels.h +++ b/cpp/tensorrt_llm/kernels/decodingKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/decodingInput.h" @@ -25,8 +26,7 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace kernels { @@ -117,7 +117,9 @@ void invokeTransposeLogProbs(float* output_log_probs, float* output_log_probs_ti } // namespace kernels -namespace runtime::kernels +TRTLLM_NAMESPACE_END + +namespace tensorrt_llm::runtime::kernels { //! \brief Inserts the running beams into the finished beams stored in the CBA buffers. (beams where the most likely //! continuation is the end token get stored separately, and another candidate next token is stored). 
Then sorts the @@ -132,6 +134,4 @@ namespace runtime::kernels void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decodingInput, SamplingConfig const& samplingConfig, runtime::CudaStream const& cudaStream); -} // namespace runtime::kernels - -} // namespace tensorrt_llm +} // namespace tensorrt_llm::runtime::kernels diff --git a/cpp/tensorrt_llm/kernels/delayStream.cu b/cpp/tensorrt_llm/kernels/delayStream.cu index ec0146c4b8..89b4b2cca9 100644 --- a/cpp/tensorrt_llm/kernels/delayStream.cu +++ b/cpp/tensorrt_llm/kernels/delayStream.cu @@ -13,12 +13,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/delayStream.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __global__ void delayStreamKernel(long long delay_micro_secs) { @@ -34,4 +37,6 @@ void invokeDelayStreamKernel(long long delay_micro_secs, cudaStream_t stream) delayStreamKernel<<<1, 1, 0, stream>>>(delay_micro_secs); check_cuda_error(cudaGetLastError()); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/delayStream.h b/cpp/tensorrt_llm/kernels/delayStream.h index 8266416da6..65035e3a82 100644 --- a/cpp/tensorrt_llm/kernels/delayStream.h +++ b/cpp/tensorrt_llm/kernels/delayStream.h @@ -16,9 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { void invokeDelayStreamKernel(long long delay_micro_secs, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/doraScaling.cu b/cpp/tensorrt_llm/kernels/doraScaling.cu index c2308f0874..bd441cfb49 100644 --- 
a/cpp/tensorrt_llm/kernels/doraScaling.cu +++ b/cpp/tensorrt_llm/kernels/doraScaling.cu @@ -14,12 +14,15 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaUtils.h" // TODO(oargov): literally zero performance optimization work was put into these kernels and their launch parameters, // since they should hopefully be fused to some gemm eventually. -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template __global__ void tokenPerChannelScaleKernel(size_t const numModules, size_t const numTokens, @@ -89,4 +92,6 @@ template void tokenPerChannelScale(int64_t const numel, size_t cons nv_bfloat16 const* const* __restrict__ scale_ptrs, nv_bfloat16* __restrict__ result, cudaStream_t stream); #endif -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/doraScaling.h b/cpp/tensorrt_llm/kernels/doraScaling.h index 4b24f26ff2..9df8661e07 100644 --- a/cpp/tensorrt_llm/kernels/doraScaling.h +++ b/cpp/tensorrt_llm/kernels/doraScaling.h @@ -15,14 +15,16 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template void tokenPerChannelScale(int64_t const numel, size_t const numModules, size_t const numGroups, int64_t const* __restrict__ cumModuleSizes, T const* a, T const* const* scale_ptrs, T* result, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu index 1480be8140..8e8e819117 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.cu @@ -21,6 +21,7 @@ #include "cuda.h" 
#include "cuda_bf16.h" #include "cuda_runtime.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h" @@ -29,7 +30,9 @@ using bf16_t = __nv_bfloat16; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { __device__ void hmma_16_8_16_f32acc_bf16ab( @@ -681,4 +684,6 @@ template void invokeFusedAGemm<__nv_bfloat16, 7168, 2112, 8>( template void invokeFusedAGemm<__nv_bfloat16, 7168, 2112, 16>( __nv_bfloat16*, __nv_bfloat16 const*, __nv_bfloat16 const*, int num_tokens, cudaStream_t); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h index 36548da54c..6adaec89da 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3FusedAGemm.h @@ -17,15 +17,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { template void invokeFusedAGemm(T* output, T const* mat_a, T const* mat_b, int num_tokens, cudaStream_t const stream); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu index 34557cc490..0b406e103f 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.cu @@ -14,11 +14,15 @@ 
* limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" + #include "tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { // Custom FMA implementation using PTX assembly instructions @@ -238,4 +242,6 @@ template void tensorrt_llm::kernels::dsv3MinLatencyKernels::invokeRouterGemm<__n template void tensorrt_llm::kernels::dsv3MinLatencyKernels::invokeRouterGemm<__nv_bfloat16, 16, 256, 7168>( float*, __nv_bfloat16 const*, __nv_bfloat16 const*, cudaStream_t); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h index 948b1ef8d4..ffd77cf12a 100644 --- a/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h +++ b/cpp/tensorrt_llm/kernels/dsv3MinLatencyKernels/dsv3RouterGemm.h @@ -16,15 +16,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include -namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::dsv3MinLatencyKernels { template void invokeRouterGemm(float* output, T const* mat_a, T const* mat_b, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::dsv3MinLatencyKernels +} // namespace kernels::dsv3MinLatencyKernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp index b46564d49a..4103729940 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp @@ -15,9 +15,12 @@ */ #include "fmhaDispatcher.h" +#include "tensorrt_llm/common/config.h" #include 
"tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -247,4 +250,6 @@ void FmhaDispatcher::run(MHARunnerParams runnerParams) //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.h b/cpp/tensorrt_llm/kernels/fmhaDispatcher.h index f79c55d380..26a40411fd 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.h +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" @@ -23,7 +24,9 @@ using tensorrt_llm::common::op::UniqPtrWNullCopy; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -62,4 +65,6 @@ private: //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh index 13de943b43..eda5f38d31 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/fp4_converter.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBufferUtils.cuh" #include 
"tensorrt_llm/common/cudaFp8Utils.h" @@ -29,7 +30,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -261,4 +264,6 @@ struct FP4Converter } }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h index 22c1dc40ed..0e05e0a835 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/layernorm_param.h @@ -16,10 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -41,4 +44,6 @@ struct GeneralFP4AddBiasResidualPreLayerNormParam cudaStream_t stream; }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh index a9cf71a2a8..1e2ebd62d0 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBufferUtils.cuh" #include "tensorrt_llm/common/cudaFp8Utils.h" @@ -27,7 +28,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -333,4 +336,6 @@ struct LowLatencyLayerNorm } }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh 
index 51c6ca7564..5776c41119 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaBufferUtils.cuh" #include "tensorrt_llm/common/cudaFp8Utils.h" @@ -25,7 +26,9 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { struct DummyFusedOperator @@ -838,4 +841,6 @@ __global__ void __launch_bounds__(TARGET_THREADS, 1) warpSpecializedInvoker(type T::run(param); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h old mode 100755 new mode 100644 index b5c00f90ce..c7579251fb --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.h @@ -15,9 +15,12 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { struct WarpSpecializedCounters @@ -43,4 +46,6 @@ enum class SCALE_TYPE template void invokeWSLayerNorm(WarpSpecializedParam param, bool use_rms_norm, int ctas); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu index 4dc10f05e7..9103491cdd 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm_fp4_traits.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/common/logger.h" @@ -25,7 +26,9 @@ using namespace tensorrt_llm::kernels; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -317,4 +320,6 @@ void invokeWSLayerNorm invokeWSLayerNormImpl(param, use_rms_norm, ctas); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu index 7b53818762..633b276b12 100644 --- a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu +++ b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.cu @@ -14,16 +14,17 @@ * limitations under the License. */ -#include "tensorrt_llm/kernels/fusedMoeCommKernels.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/cudaUtils.h" #include -#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/fusedMoeCommKernels.h" #include "tensorrt_llm/kernels/quantization.cuh" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1779,4 +1780,5 @@ void launchLocalFifoSendRecv(FusedMoeFieldInfo const& sendFieldInfo, FusedMoeFie } // namespace fused_moe_comm_tests } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h index 7a17257bff..31aab22507 100644 --- a/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h +++ b/cpp/tensorrt_llm/kernels/fusedMoeCommKernels.h @@ -19,12 +19,13 @@ #include +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/moeCommKernelsCommon.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -558,4 +559,5 @@ void launchLocalFifoSendRecv(FusedMoeFieldInfo 
const& sendFieldInfo, FusedMoeFie } // namespace fused_moe_comm_tests } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu index 80245d0b52..73326af8c4 100644 --- a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu +++ b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.cu @@ -15,6 +15,7 @@ */ #include "fusedQKNormRopeKernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/mathUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -24,7 +25,9 @@ #include #include -namespace tensorrt_llm::common +TRTLLM_NAMESPACE_BEGIN + +namespace common { // Specialization for packed_as used in this kernel. template <> @@ -44,9 +47,12 @@ struct packed_as { using type = uint4; }; -} // namespace tensorrt_llm::common +} // namespace common -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_END +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -327,4 +333,6 @@ void launchFusedQKNormRope(void* qkv, int const num_tokens, int const num_heads_ default: TLLM_THROW("Unsupported head dimension for fusedQKNormRope: %d", head_dim); } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h index 85d71f7e7c..7dab7dbbb2 100644 --- a/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h +++ b/cpp/tensorrt_llm/kernels/fusedQKNormRopeKernel.h @@ -16,10 +16,11 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -46,4 +47,5 @@ void launchFusedQKNormRope( bool is_qk_norm); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/gptKernels.cu b/cpp/tensorrt_llm/kernels/gptKernels.cu index 7d6332d1a4..082709e7af 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.cu +++ b/cpp/tensorrt_llm/kernels/gptKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -26,8 +27,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -358,4 +359,5 @@ __global__ void updatePaddingCountKernel(int* paddingPerSeq, int const* seqLengt } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/gptKernels.h b/cpp/tensorrt_llm/kernels/gptKernels.h index 38c56be902..f5ba9a1b76 100644 --- a/cpp/tensorrt_llm/kernels/gptKernels.h +++ b/cpp/tensorrt_llm/kernels/gptKernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" #include "tensorrt_llm/runtime/iTensor.h" #include @@ -22,8 +23,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -275,4 +276,5 @@ template void invokeBuildDecoderInfo(BuildDecoderInfoParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupGemm.cu b/cpp/tensorrt_llm/kernels/groupGemm.cu index 5305e85a4f..5b8c0d9291 100644 --- a/cpp/tensorrt_llm/kernels/groupGemm.cu +++ b/cpp/tensorrt_llm/kernels/groupGemm.cu @@ -24,12 +24,13 @@ #include "groupGemm.h" #include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/memoryUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" +#include 
"tensorrt_llm/common/memoryUtils.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -259,4 +260,4 @@ void groupedGemm(std::vector problem_sizes, std::vecto } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupGemm.h b/cpp/tensorrt_llm/kernels/groupGemm.h index 0fabcb9562..dbc1e498b7 100644 --- a/cpp/tensorrt_llm/kernels/groupGemm.h +++ b/cpp/tensorrt_llm/kernels/groupGemm.h @@ -16,10 +16,11 @@ #pragma once #include "cutlass/gemm_coord.h" +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -32,4 +33,4 @@ void groupedGemm(std::vector problem_sizes, std::vecto } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu index b13c8e100f..58b6bc9d8f 100644 --- a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu +++ b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include #include #include @@ -23,7 +24,9 @@ #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h" -namespace tensorrt_llm::kernels::group_rms_norm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::group_rms_norm { // Helper function to calculate the number of warps to launch for GroupRMSNormBase template @@ -876,4 +879,6 @@ void GroupRMSNormKernelLauncherWithHeuristic(GroupRMSParams& params) INSTANTIATE_GROUP_RMS_NORM_WITH_HEURISTIC(1) INSTANTIATE_GROUP_RMS_NORM_WITH_HEURISTIC(2) -} // namespace tensorrt_llm::kernels::group_rms_norm +} // namespace kernels::group_rms_norm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h index c121705f6d..335adf44ed 100644 --- a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h +++ b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.h @@ -14,15 +14,18 @@ * limitations under the License. */ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels::group_rms_norm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::group_rms_norm { template @@ -73,4 +76,6 @@ void GroupRMSNormKernelLargeBatchLauncher(GroupRMSParams& params); template void GroupRMSNormKernelLauncherWithHeuristic(GroupRMSParams& params); -} // namespace tensorrt_llm::kernels::group_rms_norm +} // namespace kernels::group_rms_norm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/helixKernels.cu b/cpp/tensorrt_llm/kernels/helixKernels.cu index c08b244de9..ffaa490b14 100644 --- a/cpp/tensorrt_llm/kernels/helixKernels.cu +++ b/cpp/tensorrt_llm/kernels/helixKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/helixKernels.h" @@ -29,8 +30,8 @@ using namespace tensorrt_llm::common; namespace cg = cooperative_groups; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { static constexpr int WARP_SIZE = 32; @@ -240,4 +241,5 @@ INSTANTIATE_POST_PROC(__half); INSTANTIATE_POST_PROC(__nv_bfloat16); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/helixKernels.h b/cpp/tensorrt_llm/kernels/helixKernels.h index 2a0e632434..d7b96e32bd 100644 --- a/cpp/tensorrt_llm/kernels/helixKernels.h +++ b/cpp/tensorrt_llm/kernels/helixKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include @@ -23,8 +24,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template @@ -43,4 +44,5 @@ template void helixPostProcess(HelixPostProcParams const& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu b/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu index 3cb35273a9..3132d166f6 100644 --- a/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu +++ b/cpp/tensorrt_llm/kernels/indexerKCacheScatter.cu @@ -16,9 +16,12 @@ #include "IndexerKCacheScatter.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace @@ -149,4 +152,6 @@ void invokeIndexerKCacheScatter(uint8_t const* k_fp8_bytes, uint8_t const* k_sca TLLM_CUDA_CHECK(cudaGetLastError()); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/indexerTopK.cu b/cpp/tensorrt_llm/kernels/indexerTopK.cu index 
40e377c998..740e83f0bb 100644 --- a/cpp/tensorrt_llm/kernels/indexerTopK.cu +++ b/cpp/tensorrt_llm/kernels/indexerTopK.cu @@ -16,6 +16,7 @@ */ #include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/noAuxTcKernels.h" @@ -25,7 +26,9 @@ namespace cg = cooperative_groups; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace { @@ -766,4 +769,6 @@ void invokeIndexerTopKPrefill(float const* logits, int const* rowStarts, int con sync_check_cuda_error(stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index 6f777b25ff..ac28ba8f9f 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0482a61bb6d9435386aa5dcf155145e51cc6f820bfc52ffdecb0dd12c0368ae4 -size 67086296 +oid sha256:0a345d90233d94c0b3f6b9f5c6e79152852354e174f0edd68f00c2554e9e32b5 +size 67111548 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index 4563244946..eb6005bb71 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ 
-40a3ef577419b5a9c6d5ca0d3201603889622eb62048319f657cbffc2c076be3 libtensorrt_llm_internal_cutlass_kernels_static.a -commit 33f251e0599197ad3e6c59d64a42f9721d3cc27c +389ecc2585d407dcf336cfb5d1fdf7cdf77922998b0560743c5b162172fa57c1 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 9fc66c405c7caaaeb65542ba1498f00d863f0a4a diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h index b7eba1ab34..09c1fbd586 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/allreduce_gemm_runner.h @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include @@ -23,18 +24,29 @@ #include "cutlass/layout/layout.h" #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" -namespace tensorrt_llm::kernels::cutlass_kernels -{ using namespace cute; using namespace tensorrt_llm::cutlass_extensions; +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::cutlass_kernels +{ enum GemmAllReduceImpl { kNVLS_2SHOT }; +// Specifies whether to use SM or switch for allreduce. +// SM is more efficient for GPUs=2 and switch for GPUs>2. +enum ReduceLocationType +{ + kSM, + kSWITCH +}; + // Decouples IPluginResource from the GemmAllReduce runner interface. 
class PersistentWorkspaceInterface { @@ -42,7 +54,6 @@ public: virtual ~PersistentWorkspaceInterface() = default; virtual void allocate() = 0; virtual int free() = 0; - virtual size_t size() = 0; }; class GemmAllReduceImplInterface @@ -55,6 +66,7 @@ public: { GemmAllReduceImpl impl; MainloopScheduleType schedule; + ReduceLocationType reduce_location; TileShape tile_shape; ClusterShape cluster_shape; int MMA_SMs; @@ -71,10 +83,21 @@ public: return ""; }; + auto get_reduction_name = [&]() + { + switch (reduce_location) + { + case ReduceLocationType::kSM: return "SM"; + case ReduceLocationType::kSWITCH: return "Switch"; + } + return ""; + }; + std::stringstream ss; ss << "LaunchConfig("; ss << get_impl_name(); ss << ", Schedule_" << get_mainloop_schedule_name(schedule); + ss << ", Reduction_" << get_reduction_name(); ss << ", TileShape_" << get_tile_shape_name(tile_shape); ss << ", ClusterShape_" << get_cluster_shape_name(cluster_shape); ss << ", MmaSms_" << MMA_SMs; @@ -84,8 +107,8 @@ public: bool operator<(LaunchConfig const& other) const { - return std::tie(impl, schedule, tile_shape, cluster_shape, MMA_SMs) - < std::tie(other.impl, other.schedule, other.tile_shape, other.cluster_shape, other.MMA_SMs); + return std::tie(impl, schedule, reduce_location, tile_shape, cluster_shape, MMA_SMs) < std::tie(other.impl, + other.schedule, other.reduce_location, other.tile_shape, other.cluster_shape, other.MMA_SMs); } }; @@ -248,4 +271,6 @@ private: std::map mGemmRegistry; }; -} // namespace tensorrt_llm::kernels::cutlass_kernels +} // namespace kernels::cutlass_kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h index 25b4aff8f3..37f55f3edd 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/fp4_gemm.h @@ -21,13 +21,14 @@ #include #include 
"cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace internal_cutlass_kernels @@ -98,4 +99,5 @@ private: } // namespace internal_cutlass_kernels } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h index fed9276e03..6cb38013c4 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm.h @@ -18,17 +18,14 @@ #pragma once #include "cutlass_extensions/gemm_configs.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include -// namespace tk = tensorrt_llm::common; +TRTLLM_NAMESPACE_BEGIN -namespace tkc = tensorrt_llm::cutlass_extensions; - -namespace tensorrt_llm -{ namespace kernels { namespace internal_cutlass_kernels @@ -127,4 +124,4 @@ private: }; // namespace internal_cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h index 9b6e4f042f..ed52b52928 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/low_latency_gemm_swiglu.h @@ -17,13 +17,14 @@ #pragma once #include "low_latency_gemm.h" +#include "tensorrt_llm/common/config.h" // namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; -namespace tensorrt_llm -{ 
+TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace internal_cutlass_kernels @@ -73,4 +74,5 @@ private: }; // namespace internal_cutlass_kernels }; // namespace kernels -}; // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h index b00fa18e11..e3d62ef3b7 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_gemm_kernels.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/workspace.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm_configs.h" @@ -37,9 +38,7 @@ #include #endif -namespace tensorrt_llm -{ - +TRTLLM_NAMESPACE_BEGIN // Note update moe.py to match enum class ActivationType { @@ -50,7 +49,6 @@ enum class ActivationType Geglu, SwigluBias, Identity, - Relu2, InvalidType }; @@ -196,8 +194,7 @@ struct TmaWarpSpecializedGroupedGemmInput struct INT4GroupwiseParams { - constexpr static int int4_group_size = 128; - constexpr static int wfp4a16_group_size = 32; + constexpr static int group_size = 128; // Unused, hard-coded to 128 bool enabled = false; using SFA = __nv_bfloat16; using SFB = __nv_bfloat16; // Unused @@ -266,6 +263,7 @@ public: #else static constexpr bool use_fp8 = false; static constexpr bool use_w4afp8 = false; + static constexpr bool use_wfp4afp4 = false; #endif #if defined(ENABLE_FP4) @@ -316,4 +314,4 @@ private: size_t calcMaxWorkspaceSize(int num_experts) const; }; -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h index a68e0b9bfe..132990603d 100644 --- 
a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/include/moe_kernels.h @@ -19,10 +19,10 @@ #include "cutlass/gemm/gemm.h" #include "moe_gemm_kernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" -#include #ifdef ENABLE_FP4 #include #endif @@ -34,7 +34,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static inline size_t pad_to_multiple_of_16(size_t const& input) @@ -425,9 +427,9 @@ public: virtual void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, - QuantParams quant_params, int64_t const num_rows, int64_t const num_valid_rows, int64_t const hidden_size, - int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, - void* final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool use_lora, + QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, + int* expanded_source_row_to_expanded_dest_row, MOEParallelismConfig parallelism_config, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, bool min_latency_mode, MoeMinLatencyParams& min_latency_params, cudaStream_t stream) = 0; @@ -439,11 +441,11 @@ public: int64_t const* const num_valid_tokens_ptr, void const* const fc1_int_scales, float const* const fc1_fp8_dequant, float const* const 
fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, - int64_t const num_rows, int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, - int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, - bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, - bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert) + int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, + bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) = 0; virtual void gemm2(void const* const input, void* const gemm_output, void* const final_output, @@ -451,14 +453,14 @@ public: void const* const fc2_expert_weights, void const* const fc2_expert_biases, void const* const fc2_int_scales, float const* const fc2_fp8_dequant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc2_fp4_act_flat, QuantParams quant_params, float const* const token_topk_unpermuted_scales, - float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row, - int const* permuted_row_to_unpermuted_row, int const* const expert_for_source_row, + float const* const token_topk_permuted_scales, int const* const expanded_source_row_to_expanded_dest_row, + int const* expanded_dest_row_to_expanded_source_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const 
num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array, - bool use_lora, void* fc2_lora, bool use_deepseek_fp8_block_scale, cudaStream_t stream, - MOEParallelismConfig parallelism_config, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids, int start_expert) + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora, + bool use_deepseek_fp8_block_scale, cudaStream_t stream, MOEParallelismConfig parallelism_config, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) = 0; virtual std::pair @@ -470,7 +472,7 @@ public: TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, void const* bias1, void const* bias2, void* gemm1_output, void* gemm2_output, float const* router_scales, - int const* permuted_row_to_unpermuted_row, cudaStream_t stream) + int const* expanded_dest_row_to_expanded_source_row, cudaStream_t stream) = 0; virtual std::pair @@ -573,9 +575,9 @@ public: void runMoe(void const* input_activations, void const* input_sf, int const* token_selected_experts, float const* token_final_scales, void const* fc1_expert_weights, void const* fc1_expert_biases, ActivationParams fc1_activation_type, void const* fc2_expert_weights, void const* fc2_expert_biases, - QuantParams quant_params, int64_t const num_rows, int64_t const num_valid_rows, int64_t const hidden_size, - int64_t const inter_size, int const num_experts, int const experts_per_token, char* workspace_ptr, - void* 
final_output, int* unpermuted_row_to_permuted_row, MOEParallelismConfig parallelism_config, bool use_lora, + QuantParams quant_params, int64_t const num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts, int const experts_per_token, char* workspace_ptr, void* final_output, + int* expanded_source_row_to_expanded_dest_row, MOEParallelismConfig parallelism_config, bool use_lora, LoraParams& lora_params, bool use_deepseek_fp8_block_scale, bool min_latency_mode, MoeMinLatencyParams& min_latency_params, cudaStream_t stream) override; @@ -593,11 +595,10 @@ public: ScaleBiasType const* const fc1_int_scales, float const* const fc1_fp8_dequant, float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, - int64_t const num_rows, int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, - int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, - cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids, int start_expert); + int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, + bool bias_is_broadcast, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, + bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert); static void gemm2(MoeGemmRunner& gemm_runner, DeepSeekBlockScaleGemmRunner* fp8_blockscale_gemm_runner, T const* const input, void* const gemm_output, @@ -606,14 +607,13 @@ public: ScaleBiasType const* const fc2_expert_biases, ScaleBiasType const* 
const fc2_int_scales, float const* const fc2_fp8_dequant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc2_fp4_act_flat, QuantParams quant_params, float const* const token_topk_unpermuted_scales, - float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row, - int const* permuted_row_to_unpermuted_row, int const* const expert_for_source_row, + float const* const token_topk_permuted_scales, int const* const expanded_source_row_to_expanded_dest_row, + int const* expanded_dest_row_to_expanded_source_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array, - bool use_lora, void* fc2_lora, cudaStream_t stream, MOEParallelismConfig parallelism_config, - cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, - int* active_expert_global_ids, int start_expert); + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora, + cudaStream_t stream, MOEParallelismConfig parallelism_config, cutlass_extensions::CutlassGemmConfig config, + bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert); // Overrides to allow us to forward on to the internal functions with the pointers using the correct type void gemm1(void const* const input, void* const output, void* const intermediate_result, @@ -622,21 +622,20 @@ public: int64_t const* const num_valid_tokens_ptr, void const* const fc1_int_scales, float const* const fc1_fp8_dequant, float const* const fc2_fp8_quant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc1_fp4_act_flat, 
TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_fp4_act_flat, QuantParams quant_params, - int64_t const num_rows, int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, - int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, - ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, - bool use_deepseek_fp8_block_scale, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, - bool min_latency_mode, int* num_active_experts_per, int* active_expert_global_ids, int start_expert) override + int64_t const num_rows, int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, + bool bias_is_broadcast, bool use_deepseek_fp8_block_scale, cudaStream_t stream, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) override { auto* block_scale_gemm_runner = use_deepseek_fp8_block_scale ? 
getDeepSeekBlockScaleGemmRunner() : nullptr; return Self::gemm1(moe_gemm_runner_, block_scale_gemm_runner, static_cast(input), static_cast(output), intermediate_result, expert_first_token_offset, tma_ws_input_template, static_cast(fc1_expert_weights), static_cast(fc1_expert_biases), num_valid_tokens_ptr, static_cast(fc1_int_scales), fc1_fp8_dequant, fc2_fp8_quant, - fc1_fp4_act_flat, fc2_fp4_act_flat, quant_params, num_rows, expanded_num_rows, expected_tokens_per_expert, - hidden_size, inter_size, num_experts_per_node, fc1_activation_type, alpha_scale_ptr_array, - bias_is_broadcast, stream, config, min_latency_mode, num_active_experts_per, active_expert_global_ids, - start_expert); + fc1_fp4_act_flat, fc2_fp4_act_flat, quant_params, num_rows, expanded_num_rows, hidden_size, inter_size, + num_experts_per_node, fc1_activation_type, alpha_scale_ptr_array, bias_is_broadcast, stream, config, + min_latency_mode, num_active_experts_per, active_expert_global_ids, start_expert); } void gemm2(void const* const input, void* const gemm_output, void* const final_output, @@ -644,25 +643,25 @@ public: void const* const fc2_expert_weights, void const* const fc2_expert_biases, void const* const fc2_int_scales, float const* const fc2_fp8_dequant, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fc2_fp4_act_flat, QuantParams quant_params, float const* const token_topk_unpermuted_scales, - float const* const token_topk_permuted_scales, int const* const unpermuted_row_to_permuted_row, - int const* permuted_row_to_unpermuted_row, int const* const expert_for_source_row, + float const* const token_topk_permuted_scales, int const* const expanded_source_row_to_expanded_dest_row, + int const* expanded_dest_row_to_expanded_source_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const 
num_experts_per_node, int64_t const experts_per_token, float const** alpha_scale_ptr_array, - bool use_lora, void* fc2_lora, bool use_deepseek_fp8_block_scale, cudaStream_t stream, - MOEParallelismConfig parallelism_config, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids, int start_expert) override + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + int64_t const experts_per_token, float const** alpha_scale_ptr_array, bool use_lora, void* fc2_lora, + bool use_deepseek_fp8_block_scale, cudaStream_t stream, MOEParallelismConfig parallelism_config, + cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, int* num_active_experts_per, + int* active_expert_global_ids, int start_expert) override { auto* block_scale_gemm_runner = use_deepseek_fp8_block_scale ? getDeepSeekBlockScaleGemmRunner() : nullptr; return Self::gemm2(moe_gemm_runner_, block_scale_gemm_runner, static_cast(input), gemm_output, static_cast(final_output), expert_first_token_offset, tma_ws_input_template, static_cast(fc2_expert_weights), static_cast(fc2_expert_biases), static_cast(fc2_int_scales), fc2_fp8_dequant, fc2_fp4_act_flat, quant_params, - token_topk_unpermuted_scales, token_topk_permuted_scales, unpermuted_row_to_permuted_row, - permuted_row_to_unpermuted_row, expert_for_source_row, num_valid_tokens_ptr, num_rows, expanded_num_rows, - expected_tokens_per_expert, hidden_size, inter_size, num_experts_per_node, experts_per_token, - alpha_scale_ptr_array, use_lora, fc2_lora, stream, parallelism_config, config, min_latency_mode, - num_active_experts_per, active_expert_global_ids, start_expert); + token_topk_unpermuted_scales, token_topk_permuted_scales, expanded_source_row_to_expanded_dest_row, + expanded_dest_row_to_expanded_source_row, expert_for_source_row, num_valid_tokens_ptr, num_rows, + expanded_num_rows, hidden_size, inter_size, num_experts_per_node, 
experts_per_token, alpha_scale_ptr_array, + use_lora, fc2_lora, stream, parallelism_config, config, min_latency_mode, num_active_experts_per, + active_expert_global_ids, start_expert); } virtual size_t getGemmWorkspaceSize(int num_experts_per_node) const override @@ -679,7 +678,7 @@ public: TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, void const* bias1, void const* bias2, void* gemm1_output, void* gemm2_output, float const* router_scales, - int const* permuted_row_to_unpermuted_row, cudaStream_t stream) override + int const* expanded_dest_row_to_expanded_source_row, cudaStream_t stream) override { return Self::computeStridesTmaWarpSpecialized(expert_first_token_offset, layout_info1, layout_info2, num_tokens, expanded_num_tokens, gemm1_n, gemm1_k, gemm2_n, gemm2_k, num_experts_per_node, @@ -688,8 +687,8 @@ public: alpha_scale_flat1, alpha_scale_flat2, fp4_act_flat1, fp4_act_flat2, quant_params, reinterpret_cast(bias1), reinterpret_cast(bias2), reinterpret_cast(gemm1_output), - reinterpret_cast(gemm2_output), router_scales, permuted_row_to_unpermuted_row, - stream); + reinterpret_cast(gemm2_output), router_scales, + expanded_dest_row_to_expanded_source_row, stream); } std::pair @@ -731,8 +730,8 @@ private: float const* alpha_scale_flat2, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat1, TmaWarpSpecializedGroupedGemmInput::ElementSF const* fp4_act_flat2, QuantParams quant_params, ScaleBiasType const* bias1, ScaleBiasType const* bias2, UnfusedGemmOutputType* gemm1_output, - UnfusedGemmOutputType* gemm2_output, float const* router_scales, int const* permuted_row_to_unpermuted_row, - cudaStream_t stream); + UnfusedGemmOutputType* gemm2_output, float const* router_scales, + int const* expanded_dest_row_to_expanded_source_row, cudaStream_t stream); static std::pair 
computeStridesTmaWarpSpecializedLowLatency(TmaWarpSpecializedGroupedGemmInput layout_info1, TmaWarpSpecializedGroupedGemmInput layout_info2, int64_t num_tokens, int64_t gemm1_n, int64_t gemm1_k, @@ -793,18 +792,17 @@ private: void* const intermediate_result, int64_t const* const expert_first_token_offset, WeightType const* const fc1_expert_weights, ScaleBiasType const* const fc1_expert_biases, float const* const fc2_fp8_quant, int64_t const num_rows, int64_t const expanded_num_rows, - int64_t const expected_tokens_per_expert, int64_t const hidden_size, int64_t const inter_size, - int const num_experts_per_node, ActivationParams fc1_activation_type, QuantParams& quant_params, - cudaStream_t stream); + int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, + ActivationParams fc1_activation_type, QuantParams& quant_params, cudaStream_t stream); static void BlockScaleFC2(DeepSeekBlockScaleGemmRunner& gemm_runner, T const* const input, void* const gemm_output, OutputType* const final_output, int64_t const* const expert_first_token_offset, WeightType const* const fc2_expert_weights, ScaleBiasType const* const fc2_expert_biases, - float const* const token_topk_unpermuted_scales, int const* const unpermuted_row_to_permuted_row, + float const* const token_topk_unpermuted_scales, int const* const expanded_source_row_to_expanded_dest_row, int const* const expert_for_source_row, int64_t const* const num_valid_tokens_ptr, int64_t const num_rows, - int64_t const expanded_num_rows, int64_t const expected_tokens_per_expert, int64_t const hidden_size, - int64_t const unpadded_hidden_size, int64_t const inter_size, int const num_experts_per_node, int64_t const k, - MOEParallelismConfig parallelism_config, QuantParams& quant_params, cudaStream_t stream); + int64_t const expanded_num_rows, int64_t const hidden_size, int64_t const inter_size, + int const num_experts_per_node, int64_t const k, MOEParallelismConfig parallelism_config, + QuantParams& 
quant_params, cudaStream_t stream); T const* applyPrequantScale(void* smoothed_act, void const* permuted_data, void const* prequant_scales, int64_t const* num_valid_tokens_ptr, int64_t const expanded_num_rows, int64_t const seq_len, bool const use_awq, @@ -960,4 +958,6 @@ private: // Populates a buffer with random values for use with MOE benchmarking void populateRandomBuffer(void* buffer_void, size_t size, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz index 769039f568..935aabe42d 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/tensorrt_llm_internal_cutlass_kernels_static.tar.xz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4c70e6e756b7c4efb0abcd0156e38d10481e9493e48fd140f9efcd1cdda68a3 -size 66889324 +oid sha256:d74cbe0df4f798fbc0c157280ebcc734ad6d1897ba3b43026e4aa22a2a4480a5 +size 66904288 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index b37609a070..4194d5219e 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -9db5ce2be51af2d4bd983af497ac9dbe53d8c57284d7ba455babd95c202db7d4 libtensorrt_llm_internal_cutlass_kernels_static.a -commit 33f251e0599197ad3e6c59d64a42f9721d3cc27c +a396f947f273fc752469160c9ae83caf393017d096cf4881ee09ad6af64296e1 libtensorrt_llm_internal_cutlass_kernels_static.a +commit 
9fc66c405c7caaaeb65542ba1498f00d863f0a4a diff --git a/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu b/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu index e5675172ac..3b91cf3f17 100644 --- a/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu +++ b/cpp/tensorrt_llm/kernels/kvCachePartialCopy.cu @@ -14,12 +14,13 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCachePartialCopy.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace @@ -133,4 +134,5 @@ void kvCacheBlockPartialCopy(IBuffer& dst, IBuffer const& src, unsigned int numL } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/kvCacheUtils.h b/cpp/tensorrt_llm/kernels/kvCacheUtils.h index 065c2e7b70..166f476112 100644 --- a/cpp/tensorrt_llm/kernels/kvCacheUtils.h +++ b/cpp/tensorrt_llm/kernels/kvCacheUtils.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheIndex.h" #include @@ -24,7 +25,9 @@ #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { // Internal for K and V cache indexing @@ -38,7 +41,7 @@ enum class KVIdxType : int32_t // only the fields necessary for context FMHA struct KVBlockArrayForContextFMHA { - using DataType = KVCacheIndex const; + using DataType = ::tensorrt_llm::kernels::KVCacheIndex const; // The maximum number of sequences supported by the kv-cache. 
int32_t mMaxSeqs; @@ -322,4 +325,6 @@ struct KVLinearBuffer } }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/layernormKernels.cu b/cpp/tensorrt_llm/kernels/layernormKernels.cu index e7943d04c2..f8dbd9343e 100644 --- a/cpp/tensorrt_llm/kernels/layernormKernels.cu +++ b/cpp/tensorrt_llm/kernels/layernormKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/quantTypeUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -21,8 +22,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -340,4 +341,5 @@ INSTANTIATE_GENERAL_LAYERNORM(__nv_bfloat16, __nv_fp8_e4m3); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/layernormKernels.h b/cpp/tensorrt_llm/kernels/layernormKernels.h index d2e7335e03..08581713d9 100644 --- a/cpp/tensorrt_llm/kernels/layernormKernels.h +++ b/cpp/tensorrt_llm/kernels/layernormKernels.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeGeneralLayerNorm(T* out, T const* input, T const* gamma, T const* bet float* dynamic_scale = nullptr, float* sum_per_token = nullptr, QuantT* out_quant = nullptr); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu index a43c8cfd32..7bdf7f593a 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu +++ 
b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu @@ -14,10 +14,13 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm { struct __align__(8) aligned_bf16x4 @@ -125,4 +128,6 @@ void llama4_bf16_bf16_gemm_op(int num_tokens, void const* A, void const* B, void llama4_bf16_bf16_gemm_launcher(num_tokens, A_bf16, B_bf16, C_bf16, stream); } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h index 18104f2a2b..a9d079a7cb 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.h @@ -15,13 +15,18 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm { void llama4_bf16_bf16_gemm_op(int num_tokens, void const* A, void const* B, void* C, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_bf16_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_bf16_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu index aa54651f0d..53efc2d24a 100644 --- 
a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh" @@ -21,7 +22,9 @@ #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { DEFINE_GET_PER_BLOCK_FUNC_PTR(/*HIDDEN_IN=*/5120, /*ALIGNED=*/true); @@ -186,4 +189,6 @@ void llama4_fp8_bf16_gemm_op(void const* A, void const* B, void* C, void const* } } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h index 709d56d3bf..35297bde38 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16Gemm.h @@ -16,15 +16,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { void llama4_fp8_bf16_gemm_op(void const* A, void const* B, void* C, void const* scaling_factor, void const* pos_ids, bool pos_ids_int64, int num_tokens, int hidden_in, int hidden_out, cudaStream_t stream); -} // namespace 
tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh index b330908d09..56ed6e4b0d 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh @@ -16,13 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT. 
@@ -357,4 +360,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_ DISPATCH_PER_BLOCK_FC_FP8_BF16_ATTN_SCALING_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED, POS_IDS_INT64); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh index eac5a41399..618a0aea0b 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh @@ -16,13 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT. 
@@ -297,4 +300,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker DISPATCH_PER_BLOCK_FC_FP8_BF16_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh index 592995dc4a..2172acde74 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh @@ -16,13 +16,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT / WARP_PER_BLOCK. @@ -323,4 +326,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_warp_kern DISPATCH_PER_WARP_FC_FP8_BF16_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_bf16_gemm +} // namespace kernels::llama4_min_latency::llama4_fp8_bf16_gemm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu index 9f7b897043..6b0c988383 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.cu @@ -14,13 +14,16 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu { DEFINE_GET_FUNC_PTR(5120, true); @@ -236,4 +239,6 @@ void llama4_fp8_fp8_gemm_swiglu_op(int num_tokens, int hidden_in, int hidden_out A, B, C, in_scale, out_scale_inv, num_tokens, hidden_in, hidden_out, tactic.first, tactic.second, stream); } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +} // namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h index aa11c4485d..f202578301 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLU.h @@ -16,16 +16,21 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu { void llama4_fp8_fp8_gemm_swiglu_op(int num_tokens, int hidden_in, int hidden_out, void const* A, void const* B, void* C, void const* in_scale, void const* out_scale_inv, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +} // namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh 
b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh index e0a459656b..d6923c4afd 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh @@ -16,11 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu { // Grid size is num_tokens / TILE_TOKEN * hidden_out / TILE_OUT. @@ -337,4 +340,6 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_fp8_gemm_swiglu_per_blo DISPATCH_FC_FP8_BF16_TILE_OUT(HIDDEN_IN, tile_token, tile_out, ALIGNED); \ } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu +} // namespace kernels::llama4_min_latency::llama4_fp8_fp8_gemm_swiglu + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu index fd4b29fd65..87b8e0d16c 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h" #include "tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh" #include @@ -33,7 +34,9 @@ #define ENABLE_PREFETCH 1 #define ENABLE_PREEXIT 1 -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_moe { #define TOPK_VEC_SIZE 4 @@ -351,4 +354,6 @@ void run_moe_llama4_tp8ep1_min_latency(int num_tokens, int num_experts, exp_idx, output_void, dequant_fc2, stream); } -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +} // namespace kernels::llama4_min_latency::llama4_moe + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h index 7d0d52c683..2cac832b39 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.h @@ -15,12 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency::llama4_moe { // Launch moe_mlp_fc13_swiglu_fp8_5120 and moe_fc_fp8_bf16_1024. 
@@ -37,4 +40,6 @@ void run_moe_llama4_tp8ep1_min_latency(int num_tokens, int num_experts, void* __restrict__ output_void, // FC2 output tensor BF16 [num_tokens][HIDDEN_SIZE] cudaStream_t stream); -} // namespace tensorrt_llm::kernels::llama4_min_latency::llama4_moe +} // namespace kernels::llama4_min_latency::llama4_moe + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh index de5df85da2..0e01146990 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Utils.cuh @@ -16,11 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/common/envUtils.h" -namespace tensorrt_llm::kernels::llama4_min_latency +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::llama4_min_latency { namespace llama4_bf16_bf16_gemm @@ -119,4 +122,6 @@ struct __align__(8) aligned_bfloat16x4 __align__(8) __nv_bfloat16 data[4]; }; -} // namespace tensorrt_llm::kernels::llama4_min_latency +} // namespace kernels::llama4_min_latency + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/logitsBitmask.cu b/cpp/tensorrt_llm/kernels/logitsBitmask.cu index 084e660cc7..ac66967e0f 100644 --- a/cpp/tensorrt_llm/kernels/logitsBitmask.cu +++ b/cpp/tensorrt_llm/kernels/logitsBitmask.cu @@ -14,14 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/logitsBitmask.h" using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace @@ -330,4 +331,5 @@ template void invokeContiguousLogitsBitmask<__nv_bfloat16>(__nv_bfloat16* logits #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/logitsBitmask.h b/cpp/tensorrt_llm/kernels/logitsBitmask.h index 942f8acada..e2e6cb28cd 100644 --- a/cpp/tensorrt_llm/kernels/logitsBitmask.h +++ b/cpp/tensorrt_llm/kernels/logitsBitmask.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeContiguousLogitsBitmask(T* logits, uint32_t const* bitmask, int32_t c int32_t batchSize, int32_t vocabSizePadded, int32_t bitmaskSize, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lookupKernels.cu b/cpp/tensorrt_llm/kernels/lookupKernels.cu index 1ae2ed8258..f1435bf0d3 100644 --- a/cpp/tensorrt_llm/kernels/lookupKernels.cu +++ b/cpp/tensorrt_llm/kernels/lookupKernels.cu @@ -14,13 +14,14 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/kernels/lookupKernels.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { /* When running with multiple GPUs, we split the embedding lookup table across multiple GPUs to save the memory @@ -92,4 +93,5 @@ INSTANTIATE_LOOK_UP(__nv_bfloat16, int8_t, int); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lookupKernels.h b/cpp/tensorrt_llm/kernels/lookupKernels.h index ac5f3f4a77..9dc5ba4886 100644 --- a/cpp/tensorrt_llm/kernels/lookupKernels.h +++ b/cpp/tensorrt_llm/kernels/lookupKernels.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template @@ -30,4 +31,5 @@ void invokeLookUp(Tout* out, Idx const* input, Tin const* weight, int64_t const Idx const size, Idx const n_embed, Tout const* perTokenScales, cudaStream_t stream = 0); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lora/dora.h b/cpp/tensorrt_llm/kernels/lora/dora.h index b8e763f5d3..fc21fe6693 100644 --- a/cpp/tensorrt_llm/kernels/lora/dora.h +++ b/cpp/tensorrt_llm/kernels/lora/dora.h @@ -15,10 +15,13 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { class DoraImpl { @@ -40,4 +43,6 @@ private: std::vector mHostBuf; nvinfer1::DataType mType; }; -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lora/lora.cpp b/cpp/tensorrt_llm/kernels/lora/lora.cpp index 67e774f60c..167826be62 100644 --- a/cpp/tensorrt_llm/kernels/lora/lora.cpp +++ b/cpp/tensorrt_llm/kernels/lora/lora.cpp @@ -15,18 +15,21 @@ * limitations under the License. */ -#include "tensorrt_llm/kernels/lora/lora.h" - #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/groupGemm.h" +#include "tensorrt_llm/kernels/lora/lora.h" #include "tensorrt_llm/kernels/splitkGroupGemm.h" #include "tensorrt_llm/runtime/iBuffer.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { // TODO should reuse the function in gemmPlugin @@ -339,4 +342,6 @@ int Lora_run(LoraImpl* impl, int64_t numTokens, int64_t numReqs, void const* inp return impl->run(numTokens, numReqs, input, loraRanks, loraWeightsPtr, weightIndex, outputs, workspace, stream); } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lora/lora.h b/cpp/tensorrt_llm/kernels/lora/lora.h index 38437b5348..7215a7af74 100644 --- a/cpp/tensorrt_llm/kernels/lora/lora.h +++ b/cpp/tensorrt_llm/kernels/lora/lora.h @@ -17,13 +17,16 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cublasMMWrapper.h" #include #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { using CublasGemmWrapper = tensorrt_llm::common::CublasMMWrapper; @@ -70,4 +73,6 @@ private: int Lora_run(LoraImpl* impl, 
int64_t numTokens, int64_t numReqs, void const* input, int32_t const* loraRanks, void const* const* loraWeightsPtr, int weightIndex, void* const* outputs, void* workspace, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lruKernel.cu b/cpp/tensorrt_llm/kernels/lruKernel.cu index a0fc4fdb84..731ccb016e 100644 --- a/cpp/tensorrt_llm/kernels/lruKernel.cu +++ b/cpp/tensorrt_llm/kernels/lruKernel.cu @@ -27,12 +27,13 @@ #endif #include "lruKernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -437,4 +438,5 @@ INSTANTIATE_RGLRU_UPDATE_DATA_TYPE(__nv_bfloat16); #undef INSTANTIATE_RGLRU_UPDATE_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/lruKernel.h b/cpp/tensorrt_llm/kernels/lruKernel.h index c49f039d48..a0f31bbea5 100644 --- a/cpp/tensorrt_llm/kernels/lruKernel.h +++ b/cpp/tensorrt_llm/kernels/lruKernel.h @@ -17,9 +17,10 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -56,4 +57,5 @@ template void invokeRGLRUUpdate(lruParams& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu index 8e58d80ffa..e7489b29cf 100644 --- a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu +++ b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.cu @@ -26,6 +26,7 @@ #include "mambaConv1dKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" @@ -97,8 +98,8 
@@ __device__ static inline void cp_wait_group() #endif } -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1318,4 +1319,5 @@ template void invokeMambaConv1dGeneration<__nv_bfloat16>(MambaConv1dParamsBase& #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h index 4fb0d2dec4..2c7eadc5b0 100644 --- a/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h +++ b/cpp/tensorrt_llm/kernels/mambaConv1dKernels.h @@ -17,10 +17,11 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -48,4 +49,5 @@ template void invokeMambaConv1dGeneration(MambaConv1dParamsBase& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu index 97fd88a50e..cc06fe4bc1 100644 --- a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu +++ b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cu @@ -16,6 +16,7 @@ #include "mlaChunkedPrefill.cuh" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/mathUtils.h" #include @@ -290,8 +291,8 @@ __global__ void loadChunkedKVCacheForMLAKernel(T* output_kv_ptr, T* output_k_pe_ } // namespace -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -351,4 +352,5 @@ INSTANTIATE_MLA_CHUNKED_PREFILL_KERNEL(half); INSTANTIATE_MLA_CHUNKED_PREFILL_KERNEL(float); INSTANTIATE_MLA_CHUNKED_PREFILL_KERNEL(__nv_bfloat16); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh 
b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh index 551e6d79a5..84ff1821e2 100644 --- a/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh +++ b/cpp/tensorrt_llm/kernels/mlaChunkedPrefill.cuh @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { // merged_attn [q_total_len, H=128, D=128] (T) @@ -38,4 +39,5 @@ void invokeMLALoadChunkedKV(T* output_kv_ptr, T* output_k_pe_ptr, KVBlockArray c int64_t const* cu_ctx_chunked_len, int64_t const* chunked_ld_global_offset, int lora_size, int rope_size, int max_seq_len, float const* kv_scale_quant_orig_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaKernels.cu b/cpp/tensorrt_llm/kernels/mlaKernels.cu index d678cbe082..8acd92a3c6 100644 --- a/cpp/tensorrt_llm/kernels/mlaKernels.cu +++ b/cpp/tensorrt_llm/kernels/mlaKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" @@ -31,8 +32,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1139,4 +1140,4 @@ INSTANTIATE_RW_KVCACHE_MLA(__nv_bfloat16, __nv_fp8_e4m3); } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/mlaKernels.h b/cpp/tensorrt_llm/kernels/mlaKernels.h index ce6f4b1bfa..de458857bd 100644 --- a/cpp/tensorrt_llm/kernels/mlaKernels.h +++ b/cpp/tensorrt_llm/kernels/mlaKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/unfusedAttentionKernels.h" @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -133,4 +134,5 @@ void invokeMLARopeAppendPagedKVAssignQ(KVBlockArray& kv_cache, T* q_ptr, T* late float2 const* cos_sin_cache, size_t head_num, int nope_size, int rope_size, int lora_size, float const* kv_scale_orig_quant_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeAlignKernels.cu b/cpp/tensorrt_llm/kernels/moeAlignKernels.cu index ae54aa5f4c..4cb4cfb2f0 100644 --- a/cpp/tensorrt_llm/kernels/moeAlignKernels.cu +++ b/cpp/tensorrt_llm/kernels/moeAlignKernels.cu @@ -18,13 +18,17 @@ #include "moeAlignKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/kernels/moeCommKernelsCommon.h" #include #define CEILDIV(x, y) (((x) + (y) -1) / (y)) #define WARP_SIZE 32 -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -277,4 +281,6 @@ void 
invokeMoeAlignBlockSize(void const* topk_ids, int32_t topk_ids_dtype_size, } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeAlignKernels.h b/cpp/tensorrt_llm/kernels/moeAlignKernels.h index 1cf048858d..0f730271d0 100644 --- a/cpp/tensorrt_llm/kernels/moeAlignKernels.h +++ b/cpp/tensorrt_llm/kernels/moeAlignKernels.h @@ -16,10 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { /** @@ -43,4 +46,6 @@ void invokeMoeAlignBlockSize(void const* topk_ids, int32_t topk_ids_dtype_size, int32_t* expert_ids, int32_t* num_tokens_post_pad, int32_t num_experts, int32_t block_size, int32_t numel, int32_t max_num_tokens_padded, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h b/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h index 7d4310764b..0993c987e6 100644 --- a/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h +++ b/cpp/tensorrt_llm/kernels/moeCommKernelsCommon.h @@ -15,10 +15,11 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -44,4 +45,5 @@ struct MoeExpertParallelInfo }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h index d3e8063a04..7c8aa86c22 100644 --- a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h +++ b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h @@ -16,8 +16,9 @@ */ #pragma once -namespace tensorrt_llm -{ +#include "tensorrt_llm/common/config.h" +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -89,4 +90,5 @@ struct MoePlacementInfo }; } // namespace kernels -} // 
namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu index 6c3440e9a2..1f5a9bb8e5 100644 --- a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu +++ b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include #include #include @@ -24,8 +25,8 @@ namespace cg = cooperative_groups; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -665,4 +666,5 @@ void moeSetSignalForGpuStageHost(MoeLoadBalanceSingleLayerSignal* signal, int64_ } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h index 85acd1fb68..29c6ed5373 100644 --- a/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h +++ b/cpp/tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceKernels.h @@ -16,10 +16,11 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/moeLoadBalance/moeLoadBalanceCommon.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -133,4 +134,5 @@ void moeWaitSignalForCpuStageHost(MoeLoadBalanceSingleLayerSignal* signal); void moeSetSignalForGpuStageHost(MoeLoadBalanceSingleLayerSignal* signal, int64_t iterId, bool enableStatistic); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moePrepareKernels.cu b/cpp/tensorrt_llm/kernels/moePrepareKernels.cu index b401746857..f657f60086 100644 --- a/cpp/tensorrt_llm/kernels/moePrepareKernels.cu +++ b/cpp/tensorrt_llm/kernels/moePrepareKernels.cu @@ -15,6 +15,7 @@ */ #include "moePrepareKernels.h" +#include "tensorrt_llm/common/config.h" #include @@ -24,7 +25,9 @@ namespace 
cg = cooperative_groups; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace moe_prepare @@ -374,4 +377,6 @@ size_t getMoePrepareWorkspaceSize(int epSize) } // namespace moe_prepare -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moePrepareKernels.h b/cpp/tensorrt_llm/kernels/moePrepareKernels.h index c7a095e394..ef33b4c6af 100644 --- a/cpp/tensorrt_llm/kernels/moePrepareKernels.h +++ b/cpp/tensorrt_llm/kernels/moePrepareKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/common/cudaUtils.h" @@ -23,7 +24,9 @@ #define DEBUG_PIPELINE 0 -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace moe_prepare @@ -87,4 +90,6 @@ size_t getMoePrepareWorkspaceSize(int epSize); } // namespace moe_prepare -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh index 665086c7dc..c94ff267e5 100644 --- a/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh +++ b/cpp/tensorrt_llm/kernels/moeTopKFuncs.cuh @@ -17,6 +17,7 @@ #pragma once #ifndef TRTLLM_MOETOPKFUNCS_CUH_H #define TRTLLM_MOETOPKFUNCS_CUH_H +#include "tensorrt_llm/common/config.h" #include #include @@ -24,7 +25,9 @@ #include "tensorrt_llm/kernels/archCondition.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace reduce_topk @@ -281,5 +284,7 @@ __forceinline__ __device__ void reduceTopK(cg::thread_block_tile con #undef TOPK_SWAP } // namespace reduce_topk -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END #endif // TRTLLM_MOETOPKFUNCS_CUH_H diff --git a/cpp/tensorrt_llm/kernels/moe_utils.cuh b/cpp/tensorrt_llm/kernels/moe_utils.cuh index ad8fce9fbd..bf35db9bbd 100644 --- a/cpp/tensorrt_llm/kernels/moe_utils.cuh +++ 
b/cpp/tensorrt_llm/kernels/moe_utils.cuh @@ -17,8 +17,9 @@ #pragma once -namespace tensorrt_llm -{ +#include "tensorrt_llm/common/config.h" +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -45,4 +46,5 @@ __device__ inline int64_t findTotalEltsLessThanTarget(T const* sorted_indices, i } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h b/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h index a3363388f3..74c27759d7 100644 --- a/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h +++ b/cpp/tensorrt_llm/kernels/multiHeadAttentionCommon.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -122,4 +123,5 @@ static constexpr int kIdxScaleSoftmaxPtr = 0; static constexpr int kIdxScaleSoftmaxLog2Ptr = 1; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu index b132a54b5f..19eb4be4c1 100644 --- a/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu +++ b/cpp/tensorrt_llm/kernels/noAuxTcKernels.cu @@ -16,6 +16,7 @@ */ #include "moeTopKFuncs.cuh" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/kernels/noAuxTcKernels.h" @@ -26,7 +27,9 @@ namespace cg = cooperative_groups; using namespace tensorrt_llm::common; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static constexpr int WARP_SIZE = 32; static constexpr int NumKimiK2Experts = 384; @@ -334,4 +337,6 @@ INSTANTIATE_NOAUX_TC(__nv_bfloat16, float, __nv_bfloat16, int32_t); INSTANTIATE_NOAUX_TC(__nv_bfloat16, half, __nv_bfloat16, int32_t); #endif -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END 
diff --git a/cpp/tensorrt_llm/kernels/noAuxTcKernels.h b/cpp/tensorrt_llm/kernels/noAuxTcKernels.h index e79ceee4f4..dfe6908723 100644 --- a/cpp/tensorrt_llm/kernels/noAuxTcKernels.h +++ b/cpp/tensorrt_llm/kernels/noAuxTcKernels.h @@ -17,12 +17,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -30,4 +33,6 @@ void invokeNoAuxTc(InputT* scores, BiasT* bias, OutputT* topk_values, IdxT* topk int64_t const num_experts, int64_t const n_group, int64_t const topk_group, int64_t const topk, double const routed_scaling_factor, cudaStream_t const stream = 0); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/penaltyKernels.cu b/cpp/tensorrt_llm/kernels/penaltyKernels.cu index a85f174208..08154c70c8 100644 --- a/cpp/tensorrt_llm/kernels/penaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/penaltyKernels.cu @@ -14,11 +14,12 @@ * limitations under the License. 
*/ -#include "tensorrt_llm/kernels/penaltyKernels.h" - +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" + #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/decodingCommon.h" +#include "tensorrt_llm/kernels/penaltyKernels.h" #include "tensorrt_llm/layers/defaultDecodingParams.h" #include @@ -27,7 +28,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __device__ bool almostEqual(float a, float b, float epsilon) @@ -262,4 +265,6 @@ template void invokeBatchApplyPenalty(InvokeBatchApplyPenaltyParams const template void invokeBatchApplyPenalty(InvokeBatchApplyPenaltyParams const& params); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/penaltyKernels.h b/cpp/tensorrt_llm/kernels/penaltyKernels.h index c6ab87951d..b8f2309957 100644 --- a/cpp/tensorrt_llm/kernels/penaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/penaltyKernels.h @@ -15,12 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -57,4 +60,6 @@ struct InvokeBatchApplyPenaltyParams template void invokeBatchApplyPenalty(InvokeBatchApplyPenaltyParams const& params); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/penaltyTypes.h b/cpp/tensorrt_llm/kernels/penaltyTypes.h index 79ab634967..e8d8a9201b 100644 --- a/cpp/tensorrt_llm/kernels/penaltyTypes.h +++ b/cpp/tensorrt_llm/kernels/penaltyTypes.h @@ -17,13 +17,14 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + 
namespace kernels { @@ -56,4 +57,5 @@ inline std::pair getLimitsPenalty(DecodingPenaltyType penaltyType) return std::make_pair(fltMin, fltMax); } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu index 1219d371f8..ede009307e 100644 --- a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu +++ b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.cu @@ -14,11 +14,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/moe_utils.cuh" #include "tensorrt_llm/kernels/preQuantScaleKernel.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace @@ -206,4 +207,5 @@ INSTANTIATE_PREQUANT_SCALE_PER_EXPERT(__nv_bfloat16, __nv_fp8_e4m3); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h index 47183b79be..8d4a9eef77 100644 --- a/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h +++ b/cpp/tensorrt_llm/kernels/preQuantScaleKernel.h @@ -16,6 +16,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -30,8 +31,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -45,4 +46,5 @@ void apply_per_channel_scale_per_expert_kernel_launcher(T_out* smoothed_act, T_i int const num_experts_per_node, int64_t const* num_valid_tokens_ptr, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/qserveGemm.h b/cpp/tensorrt_llm/kernels/qserveGemm.h index e5aa0bdb31..f9b374067e 100644 --- a/cpp/tensorrt_llm/kernels/qserveGemm.h +++ b/cpp/tensorrt_llm/kernels/qserveGemm.h @@ -17,10 +17,11 @@ #pragma once #include 
"tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace qserve @@ -71,4 +72,5 @@ public: } // namespace qserve } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu b/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu index 23432cb030..d7fa4939f3 100644 --- a/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu +++ b/cpp/tensorrt_llm/kernels/qserveGemmPerChannel.cu @@ -22,11 +22,12 @@ // } #include "qserveGemm.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace qserve @@ -605,4 +606,5 @@ void QServeGemmRunner::gemmPerChannel(ParamsPerChannel const& params, cudaStream } // namespace qserve } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu b/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu index 4ffebc2e27..e2f25c57ba 100644 --- a/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu +++ b/cpp/tensorrt_llm/kernels/qserveGemmPerGroup.cu @@ -21,11 +21,11 @@ // } #include "qserveGemm.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN namespace kernels { @@ -663,4 +663,5 @@ size_t QServeGemmRunner::getWorkspaceSize(int const m, int const n, int const k) } // namespace qserve } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu index 78248214c1..3941277dfa 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cu +++ b/cpp/tensorrt_llm/kernels/quantization.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" 
#include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -26,8 +27,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -429,4 +430,5 @@ template void invokeFP4Quantization<__nv_fp8_e4m3, 32>(int b, int m, int n, __nv #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 665ec2b42e..5a645e36f1 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantTypeUtils.cuh" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -902,4 +903,5 @@ quantize_with_block_size( __global__ void block_scale_interleave_kernel( int numbatches, int numRows, int numCols, uint8_t const* SFIn, uint8_t* SFOutput); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/quantization.h b/cpp/tensorrt_llm/kernels/quantization.h index 70776b2790..e571a40a16 100644 --- a/cpp/tensorrt_llm/kernels/quantization.h +++ b/cpp/tensorrt_llm/kernels/quantization.h @@ -15,13 +15,12 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include #include -namespace tensorrt_llm -{ - +TRTLLM_NAMESPACE_BEGIN enum class QuantizationSFLayout { // Block scale factors are stored in swizzled layout for cutlass FP4 kernel. 
Scale factor @@ -93,4 +92,5 @@ void computePerTokenGlobalScaleForFP4Quantization(int b, int m, int n, T const* float* globalScale, int multiProcessorCount, cudaStream_t stream = 0); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu index 050f99efda..b2355aa8d8 100644 --- a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu +++ b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/kernels/recoverFromRingAtten.h" @@ -23,8 +24,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -138,4 +139,5 @@ INSTANTIATE_RECOVER_RA(half); INSTANTIATE_RECOVER_RA(__nv_bfloat16); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h index 86ca60c2ab..9d433d0714 100644 --- a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h +++ b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -31,4 +32,5 @@ void invokeRecoverFromRA(Tout* accu_output, float* accu_softmax_stats, Tout* out int h, int d, int* cu_seqlens, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/rmsnormKernels.cu b/cpp/tensorrt_llm/kernels/rmsnormKernels.cu index c30280bf0d..8dfb6e6ade 100644 --- a/cpp/tensorrt_llm/kernels/rmsnormKernels.cu +++ b/cpp/tensorrt_llm/kernels/rmsnormKernels.cu @@ -14,6 +14,7 @@ * limitations under the 
License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/quantTypeUtils.cuh" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -21,8 +22,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -283,4 +284,5 @@ INSTANTIATE_GENERAL_RMSNORM(__nv_bfloat16, __nv_fp8_e4m3); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/rmsnormKernels.h b/cpp/tensorrt_llm/kernels/rmsnormKernels.h index df3ca6f665..fca852c898 100644 --- a/cpp/tensorrt_llm/kernels/rmsnormKernels.h +++ b/cpp/tensorrt_llm/kernels/rmsnormKernels.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/quantization.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -34,4 +35,5 @@ void invokeGeneralRmsNorm(T* out, T const* input, T const* gamma, T const* beta, float* sum_per_token = nullptr, QuantT* out_quant = nullptr); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu b/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu index e45a7bb97f..fceea61041 100644 --- a/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/common/cudaUtils.h" @@ -24,8 +25,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -610,4 +611,5 @@ void unpadding( } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/sageAttentionKernels.h b/cpp/tensorrt_llm/kernels/sageAttentionKernels.h index c2039206a5..4ef82e5b15 100644 --- a/cpp/tensorrt_llm/kernels/sageAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/sageAttentionKernels.h @@ -15,13 +15,14 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template = 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/samplingTopPKernels.h" @@ -35,8 +37,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1466,4 +1468,5 @@ template size_t getAirTopPWorkspaceSize(int32_t batchSize, int32_t vocabSi template uint32_t calcAirTopPBlockNum(int batchSize, int len, int smCnt, bool isDeterministic); template uint32_t calcAirTopPBlockNum(int batchSize, int len, int smCnt, bool isDeterministic); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 1c5d8446de..c175e708fb 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -19,10 +19,12 @@ #error CUDART_VERSION 
Undefined! #elif (CUDART_VERSION >= 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -31,7 +33,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template @@ -471,4 +475,6 @@ void invokeSetupTopKTopPRuntimeArgs(SizeType32 batchSize, ScatterDecodingParamEn } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h index ace034dc43..cb7f835f4d 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h @@ -17,12 +17,15 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { static constexpr runtime::SizeType32 TOP_K_MAX = 1024; @@ -302,4 +305,6 @@ __device__ __host__ inline void setupTopKTopPRuntimeArgOne(runtime::SizeType32 b } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu index 5c9e6945c9..d7a8d66ecf 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopPKernels.cu @@ -17,10 +17,12 @@ #error CUDART_VERSION Undefined! 
#elif (CUDART_VERSION >= 11050) #include + #else #include "3rdparty/cub/cub.cuh" #endif +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -30,7 +32,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { __global__ void topPInitialize(TokenIdType* topPIdValBuf, SizeType32* topPOffsetBuf, SizeType32* beginTopPOffsetBuf, SizeType32 batchSize, SizeType32 vocabSize) @@ -515,4 +519,6 @@ void invokeSetTopPRuntimeArgs(SizeType32 batchSize, ScatterDecodingParamEntry -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { template struct TopPSamplingKernelParams @@ -188,4 +191,6 @@ void invokeSetTopPRuntimeArgs(runtime::SizeType32 batchSize, ScatterDecodingPara ScatterDecodingParamEntry topP, bool* skipDecodePtr, float* initialTopPPtr, runtime::SizeType32 const* batchSlotsPtr, bool onDevice, cudaStream_t stream = nullptr); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h b/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h index ea4f052032..0d8226026e 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/bmmchunk.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -27,8 +28,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -896,6 +897,6 @@ static inline BmmChunkKernelFunc getBmmChunkKernel(int B_, int L_, int H_, int P } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h b/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h 
index 30a1a2c5f9..0b990e5942 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/chunkcumsum.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -25,8 +26,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -361,6 +362,6 @@ static inline ChunkCumsumKernelFunc getChunkCumsumKernel(int B_, int L_, int H_, } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h b/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h index cc81fb5094..3360560c6f 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/chunkscan.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -27,8 +28,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2285,6 +2286,6 @@ static inline ChunkScanKernelFunc getChunkScanKernel(int B_, int L_, int H_, int } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h b/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h index 1664f0062c..66c0826a69 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/chunkstate.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -27,8 +28,8 @@ #include "CudaType.h" #include "Poly.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2260,6 +2261,6 @@ static inline ChunkStateKernelFunc getChunkStateKernel(int B_, int L_, int H_, i } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et 
sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu index 4d8acd59de..935cbd3743 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_bf16.cu @@ -15,15 +15,16 @@ */ #include "../bmmchunk.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetBmmChunkKernelFunc getBmmChunkKernel_bf16 = getBmmChunkKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu index 096a2fec11..4b24405a47 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/bmmchunk_fp16.cu @@ -15,15 +15,16 @@ */ #include "../bmmchunk.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetBmmChunkKernelFunc getBmmChunkKernel_fp16 = getBmmChunkKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu index 43fda3c64a..25b8cea5da 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_bf16.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_bf16_bf16 = getChunkCumsumKernel; } // namespace kernels 
-} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu index ab7c214f8e..6dce67340f 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_bf16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_bf16_fp32 = getChunkCumsumKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu index bf3c78a9c3..c008cbec65 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp16.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_fp16_fp16 = getChunkCumsumKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu index 30c6ac7266..18ca02aad4 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkcumsum_fp16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkcumsum.h" +#include "tensorrt_llm/common/config.h" 
+ +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkCumsumKernelFunc getChunkCumsumKernel_fp16_fp32 = getChunkCumsumKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu index ac12abea52..0cae8b68ac 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_bf16.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_bf16_bf16 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu index 2c85472a0d..b91a175a09 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_bf16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_bf16_fp32 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu index 8c330cf815..bf5f7d21a5 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu +++ 
b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp16.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_fp16_fp16 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu index 7c4f11af70..e65f40073e 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkscan_fp16_fp32.cu @@ -15,15 +15,16 @@ */ #include "../chunkscan.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkScanKernelFunc getChunkScanKernel_fp16_fp32 = getChunkScanKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu index 7f7e224f2b..98bdcacd8c 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_bf16.cu @@ -15,15 +15,16 @@ */ #include "../chunkstate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkStateKernelFunc getChunkStateKernel_bf16 = getChunkStateKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu index 
7c247c5b32..32a70b8698 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/chunkstate_fp16.cu @@ -15,15 +15,16 @@ */ #include "../chunkstate.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetChunkStateKernelFunc getChunkStateKernel_fp16 = getChunkStateKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu index c62ea0c9be..968adee38a 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_bf16.cu @@ -15,15 +15,16 @@ */ #include "../statepassing.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetStatePassingKernelFunc getStatePassingKernel_bf16 = getStatePassingKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu index 0627699fda..f3f9e00224 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/instantiation/statepassing_fp16.cu @@ -15,15 +15,16 @@ */ #include "../statepassing.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { GetStatePassingKernelFunc getStatePassingKernel_fp16 = getStatePassingKernel; } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git 
a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu index 28b7cc5198..8f0a323304 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu +++ b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include #include @@ -36,8 +37,8 @@ #include "chunkstate.h" #include "statepassing.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -628,4 +629,5 @@ INSTANTIATE_SELECTIVE_SCAN_UPDATE_DATA_TYPE(__nv_bfloat16, float); #undef INSTANTIATE_SELECTIVE_SCAN_UPDATE_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h index 493d56bc5e..88f28b991b 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/selectiveScan.h @@ -31,11 +31,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -80,4 +81,5 @@ void invokeChunkScan(SSMParamsBase& params, cudaStream_t stream, tensorrt_llm::c template void invokeSelectiveScanUpdate(SSMParamsBase& params, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h b/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h index 36dbe526fd..a94dd5c363 100644 --- a/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h +++ b/cpp/tensorrt_llm/kernels/selectiveScan/statepassing.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include @@ -25,8 +26,8 @@ #include "CudaType.h" #include "Poly.h" 
-namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -265,6 +266,6 @@ static inline StatePassingKernelFunc getStatePassingKernel(int B_, int L_, int H } } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END // vim: ts=2 sw=2 sts=2 et sta diff --git a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu index 4d305467b6..6d3fe898d1 100644 --- a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.cu @@ -13,11 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/sparseAttentionKernels.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { template @@ -199,4 +200,5 @@ void invokeGatherKvPageOffsets(int32_t* output_kv_page_offsets, int32_t* output_ kv_page_offsets, seq_lengths, sparse_params, batch_size, tokens_per_page, max_num_pages_per_seq); } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h index 29487567d2..6c701a6861 100644 --- a/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/sparseAttentionKernels.h @@ -15,14 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -82,4 +83,5 @@ void invokeGatherKvPageOffsets(int32_t* output_kv_page_offsets, // [num_head_kv, int32_t const tokens_per_page, int32_t const max_num_pages_per_seq, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu index 
d474742bbb..4ff9159864 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.cu @@ -15,6 +15,7 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" @@ -35,7 +36,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { template __global__ void packAcceptedPaths(SizeType32* acceptedLengthsCumSum, SizeType32* pathsOffsets, @@ -485,4 +488,6 @@ template size_t getTypicalAcceptanceWorkspaceSize( template size_t getTypicalAcceptanceWorkspaceSize( SizeType32 batchSize, SizeType32 maxDecodingTokens, SizeType32 vocabSizePadded); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h index 8da35fb054..bedf152e44 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/common.h @@ -17,13 +17,16 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! 
\brief Linearly packs accepted paths in memory according to the accceptedLengths and bestPathIds @@ -205,4 +208,6 @@ template size_t getTypicalAcceptanceWorkspaceSize( runtime::SizeType32 batchSize, runtime::SizeType32 maxDecodingTokens, runtime::SizeType32 vocabSizePadded); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu index 88e6ea977b..7788dc6134 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.cu @@ -26,14 +26,15 @@ #include "draftTokenTreeKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -99,4 +100,5 @@ void invokeExtractRealDraftTokens(ExtractRealDraftTokensParam& params, cudaStrea } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h index be660e554a..67a28e5e2e 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h @@ -21,12 +21,12 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ -// namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -51,4 +51,4 @@ void 
invokeExtractRealDraftTokens(ExtractRealDraftTokensParam& params, cudaStrea } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu index e963033855..d707d286f5 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu @@ -15,11 +15,13 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h" + #include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! @@ -32,7 +34,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { namespace { @@ -2321,4 +2325,6 @@ void invokeCopyFinalDraftTokens(SizeType32 batchSize, SizeType32 maxDecodingDraf sync_check_cuda_error(stream); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h index 7a8b97f679..9cc639917f 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include 
"tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! \brief Sets pointers to logits in logitsPtrs according to the draftDecodingTokens. @@ -782,4 +785,6 @@ void invokeCopyFinalDraftTokens(runtime::SizeType32 batchSize, runtime::SizeType runtime::TokenIdType const* const* thirdTopKOutputIdsPtrs, runtime::TokenIdType* pluginOutputAllLayersDraftTokenIds, runtime::TokenIdType* pluginOutputDraftTokenIds, runtime::SizeType32* pluginOutputDraftLens, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu index 27f89b8074..eaab2215f1 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.cu @@ -14,7 +14,9 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/memoryUtils.h" + #include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! 
@@ -30,7 +32,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { size_t invokeScanGenerationLengths(void* __restrict__ scanTempStorage, size_t scanTempStorageBytes, SizeType32 const* __restrict__ generationLengths, SizeType32* __restrict__ scannedGenerationLengths, @@ -636,4 +640,6 @@ template void invokeCopyProbs(PackExplicitDraftTokensParams const& params, template void invokeCopyProbs(PackExplicitDraftTokensParams<__nv_bfloat16> const& params, cudaStream_t stream); #endif // ENABLE_BF16 -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h index d2ab345cd4..9b56f344c3 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h @@ -17,12 +17,15 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/common.h" #include #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { template @@ -374,4 +377,6 @@ void invokeConvertMaskToPackedMask(runtime::SizeType32 batchSize, runtime::SizeType32 const* __restrict__ batchSlots, runtime::SizeType32 maxDraftTokens, runtime::SizeType32 maxGenerationLength, runtime::SizeType32* __restrict__ packedMask, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu 
b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu index d0da906b8b..2f5eeb2c0a 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.cu @@ -15,10 +15,12 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" + #include "tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! @@ -31,7 +33,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { namespace { @@ -314,4 +318,6 @@ void invokeForwardAcceptedTokens(SizeType32 batchSize, SizeType32 const* batchSl sync_check_cuda_error(stream); TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h index 1dcb8f32b6..92fb3f6898 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/externalDraftTokensKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! 
\brief Accepts or rejects draft tokens based on their probability distributions or the equality of draft and target @@ -95,4 +98,6 @@ void invokeForwardAcceptedTokens(runtime::SizeType32 batchSize, runtime::SizeTyp runtime::TokenIdType** idsPtrs, runtime::SizeType32 step, runtime::SizeType32 maxDraftTokens, runtime::TokenIdType const* endIds, FinishedState* finishedOutput, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu index 2cb22314e2..8d1ca4530d 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.cu @@ -15,6 +15,7 @@ */ #include "kvCacheUpdateKernels.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" @@ -22,7 +23,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { using namespace tensorrt_llm::runtime; @@ -334,4 +337,6 @@ void updateKVBlockArrayDraftTokenLocationSeparateRewind(SizeType32 const* seqAcc canUseOneMoreBlock, stream); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h index 69643b0098..f8551db9b7 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/kvCacheUpdateKernels.h @@ -16,12 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include 
"tensorrt_llm/runtime/common.h" #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { using IndexType = int; @@ -205,4 +208,6 @@ void updateKVBlockArrayDraftTokenLocation(runtime::SizeType32 const* seqAccepted runtime::SizeType32 maxKVCacheLen, runtime::SizeType32 maxBlocksPerSeq, runtime::SizeType32 tokensPerBlock, bool canUseOneMoreBlock, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu index 8db96a37d5..c109f28e9a 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.cu @@ -15,10 +15,12 @@ */ #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" + #include "tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h" #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! 
@@ -31,7 +33,9 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { namespace { @@ -62,4 +66,6 @@ void scatterMedusaDraftTokens(TokenIdType* treeDraftIds, TokenIdType const* sour scatterMedusaDraftTokens<<>>( treeDraftIds, sourceDraftIds, treeIds, tokensPerStep, batchSlots, maxDecodingTokens); } -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h index a284fb16ca..8e79aa653e 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/medusaDecodingKernels.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/kernels/speculativeDecoding/common.h" #include "tensorrt_llm/runtime/common.h" @@ -23,7 +24,9 @@ #include #include -namespace tensorrt_llm::kernels::speculative_decoding +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::speculative_decoding { //! 
\brief assembles draft tokens to treeDraftIds from sourceDraftIds using indices of treeIds @@ -45,4 +48,6 @@ void scatterMedusaDraftTokens(runtime::TokenIdType* treeDraftIds, runtime::Token runtime::SizeType32 const* treeIds, runtime::SizeType32 const* tokensPerStep, runtime::SizeType32 const* batchSlots, runtime::SizeType32 maxDecodingTokens, runtime::SizeType32 batchSize, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::speculative_decoding +} // namespace kernels::speculative_decoding + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu index 2e370a4900..eb72d69d49 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.cu @@ -26,13 +26,14 @@ #include "mtpKernels.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Fallbacks.cuh" #include "tensorrt_llm/common/cudaTypeUtils.cuh" using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -561,4 +562,5 @@ template void invokeMTPRelaxedAcceptance<__nv_bfloat16>(MTPRelaxedAcceptancePara #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h index e19908101f..4beeac53ba 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/mtpKernels.h @@ -17,15 +17,16 @@ #pragma once +#include "tensorrt_llm/common/assert.h" #include #include -#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + // namespace tensorrt_llm::kernels 
namespace kernels { @@ -115,4 +116,4 @@ void invokeMTPRelaxedAcceptance(MTPRelaxedAcceptanceParam& params, cudaStream_t } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu b/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu index e6f6f55f92..6397396ea6 100644 --- a/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu +++ b/cpp/tensorrt_llm/kernels/splitkGroupGemm.cu @@ -21,15 +21,18 @@ #include "cutlass/gemm/device/gemm_universal.h" #include "cutlass/gemm/gemm.h" +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" +#include "tensorrt_llm/common/cudaUtils.h" + +#include "tensorrt_llm/common/memoryUtils.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/device/splitk_gemm_grouped.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/default_splitk_gemm_grouped.h" #include "tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/splitk_gemm_grouped.h" -#include "tensorrt_llm/common/assert.h" -#include "tensorrt_llm/common/cudaUtils.h" -#include "tensorrt_llm/common/memoryUtils.h" +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm::kernels +namespace kernels { int64_t inline getGemmCoordSize(int64_t problemCount) @@ -288,4 +291,6 @@ void splitkGroupedGemm(std::vector const& problemSizes } } -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/splitkGroupGemm.h b/cpp/tensorrt_llm/kernels/splitkGroupGemm.h index 8d7af7e4bf..6ada825529 100644 --- a/cpp/tensorrt_llm/kernels/splitkGroupGemm.h +++ b/cpp/tensorrt_llm/kernels/splitkGroupGemm.h @@ -16,10 +16,13 @@ #pragma once #include "cutlass/gemm_coord.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { int64_t getSplitkGroupedGemmParamsWorkSpaceSize(int64_t problem_count); @@ -29,4 +32,6 @@ void 
splitkGroupedGemm(std::vector const& problem_size void* gemmParamsWorkspace, int64_t gemmParamsWorkSpaceSize, void* gemmWorkSpace, int64_t gemmWorkspaceSize, bool isLoraIn, nvinfer1::DataType dataType, int splitKSlices, int minKN, cudaStream_t stream); -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu index 088e5aff79..ad2e904411 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" #include "tensorrt_llm/kernels/stopCriteriaKernels.h" @@ -21,8 +22,8 @@ using namespace tensorrt_llm::common; using namespace tensorrt_llm::runtime; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -249,4 +250,5 @@ void invokeExplicitEOSCriterion(TokenIdType const** outputIds, TokenIdType const } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h index f60ac784e7..dee64cabca 100644 --- a/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h +++ b/cpp/tensorrt_llm/kernels/stopCriteriaKernels.h @@ -15,12 +15,13 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/decodingCommon.h" #include "tensorrt_llm/runtime/common.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { //! \brief Sets finished state to FinishedState::FINISHED_STOP_WORDS if any of the stopWords is met. 
@@ -95,4 +96,5 @@ void invokeExplicitEOSCriterion(runtime::TokenIdType const** outputIds, runtime: runtime::SizeType32 const* batchSlots, runtime::SizeType32 batchSize, runtime::SizeType32 beamWidth, runtime::SizeType32 maxTokensPerStep, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index e6e4e82c92..3d6e2b730a 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -20,6 +20,7 @@ * introduced in https://dl.acm.org/doi/pdf/10.1145/3581784.3607062 . * Another variant can be found in TopP sampling: cpp/tensorrt_llm/kernels/samplingAirTopPKernels.cu . */ +#include "tensorrt_llm/common/config.h" #include #include "moeTopKFuncs.cuh" @@ -34,8 +35,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { using SizeType32 = tensorrt_llm::runtime::SizeType32; @@ -1696,4 +1697,5 @@ INSTANTIATE_TOPK_LastDim_DATA_TYPE(__nv_bfloat16); #undef INSTANTIATE_TOPK_LastDim_DATA_TYPE } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.h b/cpp/tensorrt_llm/kernels/topkLastDim.h index 31f9a12420..08379da40f 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.h +++ b/cpp/tensorrt_llm/kernels/topkLastDim.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -35,4 +36,5 @@ void invokeTopkLastDim(runtime::SizeType32 batchSize, runtime::SizeType32 inputL void* workspace, cudaStream_t stream); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp index f3b6decd38..b3d1e3a721 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp @@ -19,16 +19,17 @@ #include #include "KernelRunner.h" -#include "tensorrt_llm/common/assert.h" #include "trtllmGen_bmm_export/BatchedGemmInterface.h" #include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h" // DO NOT include cudaUtils.h and logger.h before BatchedGemmInterface.h as it #undef TLLM_LOG_INFO and co. +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -680,4 +681,5 @@ bool TrtllmGenBatchedGemmRunner::isValidConfigIndex(int32_t configIndex, int32_t } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h index 959c500fb2..0cbfa8ef57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h @@ -16,14 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include #include "trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -126,4 +127,5 @@ private: std::vector mPassingConfigIndices; }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 1b1ab14a2c..5da0e0f043 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -16,6 +16,7 @@ */ #pragma once +#include #include #include #include diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu index ba5821a8d2..d348d95cb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu @@ -17,14 +17,15 @@ #include "DevKernel.h" #include "RoutingKernel.h" #include "runner.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/DtypeDecl.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/SfLayoutDecl.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace trtllmGenFp8BlockScaleMoe @@ -599,4 +600,5 @@ void Runner::run( } // namespace trtllmGenFp8BlockScaleMoe } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h index 4edad536b5..987b953ee3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h @@ -18,6 +18,7 @@ #include "DevKernel.h" #include "RoutingKernel.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h" @@ -26,8 +27,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + 
namespace kernels { namespace trtllmGenFp8BlockScaleMoe @@ -396,4 +397,5 @@ private: } // namespace trtllmGenFp8BlockScaleMoe } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index 899769309a..e47ea6c668 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -17,3908 +17,14 @@ #pragma once #include "../kernelParams.h" +#include "tensorrt_llm/common/config.h" + +#include + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { -// clang-format off - -#define TLLM_GEN_VERSION "10a85386-dirty" -#ifndef EXCLUDE_SM_100 -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern 
unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; 
-extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; 
-extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; 
-extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern 
unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern 
unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned 
char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -#endif // EXCLUDE_SM_100 - -#ifndef EXCLUDE_SM_100 -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned 
int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned 
int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned 
int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern 
unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; 
-extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned 
int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; 
-extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; 
-extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; 
-extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern 
unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -#endif // EXCLUDE_SM_100 - struct TllmGenFmhaKernelMetaInfo { @@ -3933,9 +39,9 @@ struct TllmGenFmhaKernelMetaInfo int mHeadDimQk; int mHeadDimV; int mSM; - const unsigned char* mCubin; + unsigned char const* mCubin; unsigned int mCubinSize; - const char* mFuncName; + char const* mFuncName; int mSharedMemBytes; int mThreadsPerCTA; int mQkvLayout; @@ -3950,1957 +56,12 @@ struct TllmGenFmhaKernelMetaInfo bool m2CtaMma; bool mSparseMla; bool mSkipsSoftmaxWhenPossible; - const char* sha256; + char const* sha256; }; -static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { -#ifndef EXCLUDE_SM_100 -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9a176820807b10f588a749383d1c44f23bb3dc25df12e4a923caeab4c9e6bbcb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5a68fe4d3f52972233e7d25ba1cf4b19d88ca536bda57c2a2a3881020e521fa3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c7025ca4368ca2c5877558bf92c345e554722d7dc6b63cc2a18847ba500bd6a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ac9dac8402da2d235aee7b15461fa455658b1bae4294e07ac1f07391ba90a72e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "104bdec218bc21eed45e189a887643afacb58ab75c7bdef306beb3530d328425"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ba748a2127a77a0347795486f6c1c1afa7c8510881355a984e27d143242801bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "efad8de250772cdf2d714404888152ed5c7a14f7012afea154f5bf289520bf4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "dd4ec9a714eedb3ffb0955d0a8a299d99528a506b173fc23e88779051a6d26ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cc5b83f0b1e3ee121dc9f7c2dcdfb17b55ae37fd27462dca98effdd971f6fdc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 
128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "daf9f9e5be485d97ae8aff12d858b1908d91a2122816df8840cb02a9389c2519"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "65ccefad62f3d50738774e6ae297c6fae486bf861db9eb805857af56b080747d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "899dd5f6f56c7fa48e52d18c6314232052190307efde0659264ea8f3bd26ec3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "75ff058e28e391081af0e64633aae0a845dbf2195f900b8c3ae60cb083891533"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f4cdd3cd8d536ae5ea8b2719e25c55c75cffc3f003949ca4259f301e6e8ba196"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9db61c77449b6d4f7eec6532b436260222e8c74cf51d1a99fbb6311dde6615d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e5a08a95565cc562bf35d3e0fce7c237262bf75ab19c70bcb01111a2a3606dcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cf04c039f22f5fc9b51e10c741ea8b6c55514b32b4ae3aa46c077be19bf438a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "37ba73fe08bf06260d622b70f6fb76459318e475e11eb04e48a7930662edd0f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f76548d39932f5dd1dbf6227931db8febea9864a8ca0a5cb33c5bd9a6bd15e40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "09a630e67459114e94b31b24e3f74cb238e102771b842486fbd6c015db21b885"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "89aa4e32c12e9a400379b1321f984e480d56987fb4ee1c0b7834f45bc67ba522"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0b7f1f80c34ecee58370a97022d327315db7f8dc25c61e78de9dc514bdbdb2b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a0a04ce61b26f2d4bbebd21473089f2509772c212c9f2454754c1130c224c465"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "176d5d860be9cae393482dd75170bfe9d146fab044b2dff583ee0391b03bcc42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "56933b78d89af2b5fc937a729e8803c3b66e5bb71fac757a82df23dcd56fe993"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9695062f74aafb70cd2f6f651cc6912d3c111232785c57bdf24b52b466c40e54"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "dbee27129094dfc385000640e5792c9ac4ba94b4a86c6a32d6cb00b2a3789bc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "86358ae483adf6545077aff4bf2e9a547df94cb31e117834af217af47b1f3fe5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ba58721fff014274d3cca41454593271fb7ceb18d728297be3b26258caa592df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "d3e1a69415eb65ed13f10a56548c56039c8b5fb4700b3c216ee45600db92cc6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"59e61278596daa47fc62e428d284235e91006e3e6f33141ff2fec3b096a4866f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "4d21b42c58a098bd82ed43b89213e95231494ae157c4201cc369e5b9f105f14c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "eefd8a2b814aeb23d6375c913980eeb11618b87267479e166fba8ac180df76dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "088d43d76773e80025ae7b74be526cc6c75db5352d1f0eab70450ee0ded62059"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5b5500588afb1ae0b7701fa0ade1a45582d01c22da683c7573c0f5e15ffd11c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "3ab064eca6ab6410a4c5a50db61b5fa84cdeff359fac4b64f6b84c3ab2ffd695"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a15573bdc8e7def60fde2ec9851acd7b1c47aa715d378a1b6999364ae2edad64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2dd434eec9fa0d5729dd9bcff69908a27e4fcda92f90cc87e39ecf4993caadda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f1beed56222ae46a9401d5f2fd19af1acb4206571034380d0de45de600ac49f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0e97990087b7b71e0d2fa9a6bd33a5ffdfb89166097cc9800799f0aaebc78c62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "f1b27f666d475a4ce2a147f463e5590244cfafabfcf91d3e1e256ea4bef95e00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 
16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7468ff1a4f007742ec0c55d80a0f6ff43cbecfa9d9ec7b1a8a661defb9a4ecd7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6dd6f2fd0facc54aa0f09405fd2822e08c277a6a122c5c3ccce96085b621f6e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "27730c25edb8153b11f3d1483eb05b0e0ffdddd337c17225e9df0273490003fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f4bb2a4c5aa62fd750c35d02fc615151323852c96e7db6a5b6f4e946dc9c12c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "afe89c897631fa79c40d8ab41c3e2dfb6b74c8c23fe40f1533a60ee0fb679eb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "058e709a9984f5d288d5f036b4e17306e4f25a9f2cdc19455f3357040d57af0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eae703d47ed77d0c618064acd9e894603950302b0cbfbbe58546745d5d3a8a69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "9ae2e3d1f7eecd49644300253d81b1182be7aa6cb8642eda67a412ec679b7e4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5aac2410cf8216e50066d86d26448b7486a213a3b393ee2529cdd0a23648be3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ca752948487fd3173e12b74318515ff24c2c29dab9911caa9571cffe4c8551cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9de13b6f3e674e181a1d6fa83b3379e5d53233c87a8b69260c0435412b59d3ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "73dc4b25103728d05869b4cfbcae3ad27470cc3c6e529e6b1870b94d64896f75"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "863c72a6f1d28afcc0195a0d73354da7e07af5da52ba37aca816b146f670fa91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "fcfa117289d78d87e479d74f400c933e85f9b6a3731a98c208ecc689fc8d38ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a5ce0ad9e94fd6b1ef7110f2c98fe6f0165410ed6012d506567f132882a45f51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 
8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "07d6ae141516b7d3c9ee1a52d7cd9b2c0fc3fefb1f0ba8a101dcf457c14bf68b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b422236efca62d3480acf30fa6cb08a95601405299111a09467c1a92ef11f0c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6392d187a435e3f4de9293ac3825ef580e925ae9a5e3891f72f7d4a523a344d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, 
false, "891d89b7d3e1bcf595422342236c1509102e16fc50ce4d1dec3f3a7296fedd52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3c2e4435a35b45893239b522f1590dcfec6551d9e724a528e084cbbe84045052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "abb4dbe18240b09d2b51c35331c2a9d7922c040f310b9e04841e02ae08bf9b1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "75ef0359bd85686a0d30a884eca260962f2dd5759610207d0924f8ad739d2ec1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b3c77953d8a748a0475aee6c788a58400ae6b44dca57df3f56cfa55f682ce074"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "669bdc46670c721ce56be1f17fddbcd7b9a596b303467cbec628f3309e623a57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "732e0739cd78890ee4a9892bfc5d78e20fcdb8a219ba8d4d8cfe14c1d3a6cebf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f94e77c36416243876c49d8cd8900bfff7b62091047fdf18dd37c33e1d5eb722"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f303ad74edbda689701f0b68c9d6a31f4213dc8a6c28105584d62cf727e7b755"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "8b20864dc92a5fb4bb448c91e95c7a8ae65ec83afc5a62e56e2ad3bb9cfe73cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "706d6f5c5a101340b007f8dfe4f3056171f53644a825c2d7d1b03ca60d37c5c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 
2, 2, 16, 0, 1, true, false, false, false, false, "17f18a2da5fcd46f2b4f41aa7c138207dc6920b6f49fbefe5f8934ecac4ab42a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8b920d35b9de005aae71e786c5d0f8c2e1ff3a4cf0909657397e3b7f02f9ce44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b552d88d491de085a8468d6724b425ea5ad22ae3e567a9724b6e7451cf7d038b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e0ee2608692b7ce27a3a5a2c60fa0e45f0424e01d80895df144e788c5a73c130"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "84feaf256a5f34b34852e3da1686eea074bef3314d861e31b42d94e806f45c91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "51e61d2d8bef61988b1242d79becca1cc8cf86d9020dacc9f1d86736fa8fbdef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "e46903333a5fb00aa0aac43bcd2bb8b08432de27167d70ee5b739622bfd5afca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "bddd3aa9389f091dc709a77bc887cba95435ba18abf5e74b6c0f57a9c91cc781"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b6acefdeeab699b3f6f2eb0d0da419c5d44353251bf0083648d547369bacc831"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "33fdce28359c88adfd47829bbd1ace7e4d2b42da776518f3990c747b47b19b19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, 
"d5da56e936d6d97fcd7d5e028f441e86bd5937d249a1b7764865cbd9d2c55732"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "74880ad89dc908c813fda616a9b992e2820881515e4922917c681cebc7dafdec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "78d269a80ec7e57a3a439b03f5e8096ef56e3e80ae5ea89dc483c1536f4e5b91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "25b99b3ecba8a22be5bc4d19303d5a43d7d528f065a5cba738823f3677450bd3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "51a9447b33f700b3fdcfea211f2e98eb05389be809d1de0ab79984bac47668a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d3c9c96ad45d607343093c382df2d967cba232d87c432b9d5671627a30f05386"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ae8492a52953a1b671b54a366a9f6e8abfb2b6a43e55c01233114c43c2e975bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"c68c9158c88f0285e1183770f519514df7b3d14710bc59e4f94e68c39cb65f5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "220a7a7960616e00e07c57384faae712c2c73b13d611b9cda562ea8df0c60600"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "004b32f0077bc3b728ff75a45f9c9c99861751868b70e742f55e2a4c114d26af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e334e8bffa1e4b53d8426a1ed779e9fe02786c689a1ec480460fb15b9dbfa0cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"ae19d2d2ac022782dc92fac3f42c79e480633422abad50b243cd28e32642b03a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "4ca6a1c9cb86de7e69504f8dd0ad52de40f46f6800ecc701df403598386273da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "edfe01cdc13a53eda44df09da4260243accb2eff254324bbc6d95f945a699c9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6491108be3eaa9d3a305454a852079e7cebd5d4a9dd47c8d5034a71b15bff59c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "15053c497b53cdd92add998ebe5e467e95bd57202cb941369b8810a0fd3d5d71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d902057b573e72c81468ad3e21ab7bc8c06fad6b993290e0f6806ad7f29ee792"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "89479aa295fb50821d14b07ac5e6d39badf6af5952ed8b9e75886c0d23e60269"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "915571831c4a3baa43d58da0c872c6a5cf63a7076ece7f3b635e92b1c7128f77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ee4086173bb5fb2b4827793e61c8a75a7b7470a149abb8f4561d2db7ff67cc2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "6e81250777e27749c126497b900d238adfb9d77b6de67ac304933ff58ff4438c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7f8f6f473ed330959a1bf992f7c6663433bbbed2c1dcad97d061eda933cbb553"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "01e858a32b62c644fa1ec22739fdf1388ba21e9a7524355ef74b43ea14022404"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd398c620a37556d84e69b52c17ba9a4feee2fbbe3c97d31d28ff1225b47beba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3ff194082a2b62f60f7ea6b1152a9cb3dd6dfd8a33983d1f88d965847f1638b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "52a6710a7b5e7bb005299e77e8c3e740999bbd853e1338a29eeb89076fa8c0c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"5aaf6b37b523bb1e8c50c42bf51879ec1756c1129cd0b780d68fa6d362ab3c51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "71a4f3d040218f98334725380d3724d3022b14451febea2af9befd6577fa0fc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5963136e972a354b423f4b5f87bce97a167e45ecbb90d29b5a9086299e13fa93"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d7bdbfb47e2a6ea950dd36d073964f69e9347ac6b44d44113af8e8c24df51341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, 
false, false, false, false, "d64d27d8c4e1eeec885fc04871b263c052ff283187ec7cea37ff3fa19d5382bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ba999ce4f40bb7dd830a184bca7951dfa8bff71ef8e2fb9a4efdfe178a9dc8d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "3c5c8b5bd002495102e0ebc01d67e6b6efb8d4c345f464b8a9e3d6d0f6e7079a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b2916eba42f5cf8a1504503b93f7951415a6d1f06d935f4abbb8ef82cb9317d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3be5829657bd310d44197385ab64341dd0fda8d85901e061447c309a95ca02e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "a9c6597ef687ca0dd63c6a0d3b187beff05e35f6d3c1825040d7675eb7dc1d16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "362f02e7698d48b396c92659055af196108dee06fc2bd2e7bbb44173af3c53b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, 
"6dcdbcaa9f08bb6ba5a158c4580d44b7b059701269e0fa0d1513f880b0bc40e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "34cf51cf50bda51d4c3620578824ed250fec11159e1124e0ef419ab452067c56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1e3febea4bc2cd225fa762a1c20b7b58f1a9cc4104924490227c3c6f96c4182c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a17f0d60d9a2bae580742e3bdc2100825e3e3775ee97bb02c9ed155fdc4c3cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "f500a5dba46d45a27502eebf4d7c074ed03ad213b15197d6d730f926a730b62c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d1be4373212179556613b06d5da19516f858a72e89865d232d3f58fa2b72b038"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "fcbd6e0db465b8210c69746b24970733490738f73146c574572013b602beffd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 
124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b5f134c6a7498e79ba8f825bbf7e4e5d05605ac264c87f0562a7bf2e3079520f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7e821651894ffc8a196bd16514ea1accbffdc74c58f04c99f8741279adb7d691"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a49f8eb61a238ed1d4b24bfa5a438e365ccf22b5b5c115e65e1b9cc2e5099fcf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8fb263b43dcf76d212423f4fe3291b68ef434a6c64d77888679254887386ad99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "add7f579c80e58a158b1c31e6ee7683fba9cc10f218b92d1f84dbb51c5e89102"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d1ff0e1fcf39b3f5c1b16e45ec330c7e632e22fb798aea6e3fec7bb1b2c9802d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4a038dc6820022d273d4d45781de7b1a6eaeb2fbf7c207c2714fa857dd87b137"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2fbd4767059a0fe411b2f9fa3b344ea9e03c9020baaebfeec7d13f7e7ed2f4bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0d24488ff215eb5a38cc37495120be92ebe05a3a6878b24003559a8d2d7ac866"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "225b7dc35c64ccc33d06180243fc945056d116b6c8134c71479cae625f20c44c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3610137ea0fc810d89c5bd9a1c31807ed4f2429ee5ecbec2597e9ffe86b19f2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "304e87adf8ceb391ee1941ab129d96b9cf7c81990faee7c8010f7298f8438bcd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "98bf26c1913472257b02dea317c02bda92ac1e5eeb1bde1783fac95f61ed29cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "561aed2537e7eaa7a4685a821cff4390f841b30a2a9ca78377daf18e5f24d06d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c1e1bc69c47c3c80db81b39547ca6502a9bb9422073c1d260b3177b64bbeddba"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "d0b1ec60d316307cd0c47f96e931ee83233592200a4fcdc4243e6d70a913f598"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "66995087c70642b64b813a1b34773eb69dd9d67e214c27136d24e56362feaffa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d6329538d8fec5b01313090e1ae84ac3ae3ddbc96c0d62823c5ec2a1abc6861"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"aea86c6548409f1a47f335d3aec8488390a12f7705fa62a21585469076de0b49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b325448482814d86e32e6bb82ea17a816552cdad61e1223e4efe32f34cfb484b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "0d10b8333b75d3ca64b1308acea71d90ceb6f8f84a169841d03897cc42a6add2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0ef97810ed94b7ff6b0722160d0f9c6980f7a482407726608d48123d734e30c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bb06974794b11482c54fa4730ccbf601fed79a0eba283265a05cf565c45110ab"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e64d1bb568e2dc9fcb7c26b8de6f8cd4893d2730d9d34b0c6a9cdf7faa9fee0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5745282ece8f2a11f6ff3947557f5e8a25802861285b7c695f1ff5602cf679bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f061d7ff7c70186556c98c083424b6e701e210285caa1f8a0a97d32b11d57ae4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "39b3703d50571308c760adbfe08c7d7bd4dfffedd2a684b61f446146f1b763fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2d5b67610c0956e5891007397094db6acc754885402d8e7e3c620999b7779d3c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2576fe18a70c1799b082a945ca406d5e34a04edbc82dbc8ad38e6a0c6246fec1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "30b189778153228cc3080c8f5250f192b9950b8a83c171cc17899a4c926ef030"}, 
-{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "360a2ee98ca1e7a2450fd3a697ebf40404eb1cad89d627ac22fbbdbd5f4ed032"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "851293d99175ebfea8e36cbdb70db56542eb979315896416c56920bd672372f2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "174ea0a369ddad5bc63c79507b146689009d0e5001b3bba9e2b082a6c953cb15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f231c5102782e6f55c3b10d6fddad400ef74213788b6ecfaca1b6f10a0930262"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4c3731a229c255180e2383b0a2f23bc888080795e81cb62fbf82b208381e5240"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "58df070c76bbaa6ff2323967339e9374979fbd918f5fa479ca664f1ace49e547"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "70cdb3642c3a1b4022a5a3cbc90e521205e8570c0d2544d478db5be26561e638"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "40cfdda4f209761ef3eca907fb84f09130214cab4b6d60ee69ea1672795bfc81"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "edd9cbf6540157c4a2b9d15161d79b383a077832f307f3afed73819cf665a481"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "41f9ed182f80238688349226b59a19f90106ff4b17910cdedd60f37d4e2720ee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "285fd78c597d80fa9c353a00c3cf37ce07a038fad316b91e196905ef4ac5510e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "1b42f5dc3d6e8f39b15eb98d26429a84ebf92c329714e453ec94c5c23a951cb2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "cdcd8368539d349e56a0da3f917d3aea8f1bad85442eca9e71cac099931a5cf4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3a745ec8ab72d6e90b807cf52d631db44d7b4fbd1490b04ecbdc5ddc8d09f36a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f5d7b6911391be1aca7403b8f0dccb8e0b0b42bcdb1f703ea5e6cde9d05dd568"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cedf712829ed7eedb631e6616b51b20b6c40f7f71bce5aee9ddf4e959ff6d3f0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "988e2e64cd365677f93c1b9623883c79cb1045ec02aaef2d20635f36e0195ea9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "128f37705668460ba6579af4ce7bd3864bf474f0cccf9b6444406478f1bd26a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "62f9e7bd5ff48d0d4b77ba80d4b1b722b60762b0ec778de638c9b1a2754224f3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "14d3e3c8c95131bf255fc36e7c6e45773dbab760e485ec731477c0f2dc4a7d02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "934c2d4fb468df623d74b4632b2a9c939a6f95f849f3c8a1beaf35e669a25e85"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a22a2df86dc7210614af22e02abea4e8c7eff6916545a5ff7c5d8da7ac443c36"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 
128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6d3548d893121af62638608ea3a0b6ce0dc66243292ffd6324477b6ea9a8c7c6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "edde20adba8bf537b44729dedebcc263e4d8db0f9ca0754a5ea3971f73e13f69"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a3a49eafac4fde1df3976a08d56a347361c190ad316d52d53b7e115cc985171f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8fcce917d59b60c70af6ef18e57d57e238b15a28b4bddb8a63ab683f86e3378e"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "523fc0b3b8827a56176e7894ba229bdc7c824ad54c0465568042f710f6abcc82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "88f0843d0c1308fcbd0729b321b0a84aa6cc75f19f7b1788371ebb4d268fc3b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c3c5bb3f3002b8c12a154ebf967c05e8c58cb0deb2f85d83b68a4e2bd5b84982"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 
2, 0, 1, 0, 0, false, false, false, false, false, "691fc35d4b75894c36e33d3236cfef861a454db736a68756588c31886621dea4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b346e12d9b29ce92015c55eec7eedc3943adc616e196ebc8c65684ab8ba211d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ba50b9aea73f4b0a9a1fff264affad591a1307e07d3794e8f018b0f7d8c74993"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "84a6cb586301499f6bea219ed094e0c87df0217757e940c963a3a4208dedbd61"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "97b70df2d05f5063c18437f97937d44421606dc2bc605981e0265e1c9f4ce81b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fd0462926b6a21909d90ae77aad39edfcf53b1afc777bafe0c56887cd9d73f19"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d86e163089210d1517adac7970151574a8c7ec050f55ac8eeb1d01b4091d983b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "af05a175e01e48825c7a4e0e53b38388feb464440e950e0a0e4794538214f631"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 
128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "28d3f9a2353563b88f9ba35bbe4a53b22ef53e614da53b426004d06ee64e2332"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4844c027bc8b7b0909276e6538cf58f9b444e5930be9fb026ff9d815df6f7ea5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fab3d3b7c808c4c0fad11886427802bc38ec8316a723fa064eea9dbb2547379a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 
0, false, false, false, false, false, "7468098ee692f39f3afbba7a7e7eecf095eef3c46b67e5ae78dd0227adf119bf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b164b53bdc9f265b27bc47c5aa7c4363ff83fac3e88950ea549308ba5a5e0ae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d9fea5f24789a73af71c33264f900a0a084619099f35d7aa168655535881b603"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "75372442dc47d1d709bad3ec7c53051c9ccbdb4092993167e30de7cd38ec4d7d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "79f17ad517131c43bf9b7fe3bb9b00a55c31a76e7bc506dbe5b45315df484e51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "be2d3ad4a316e2a084067d79980221cd36d5ccf0d7286698437eb2cd4fa3cc9c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3d096ca0355ebdde61032cf1c64b2929f96d69db5f19fa8c9ef0eb1e7f7d222c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "00b046486501488c634cc21e5c52fb203fbd0a3642ccf540f01da82e9b673de2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2b0bd4206a8b2992313d99f72e6b9a63926815044b32ed11c0278ffb4e3c17c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "25430c0e17ab7265bc64de4505af06de58f3c937c0841a6ae126ebd8f20e4c3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b64c18ba0c89a875343843fe48bdfaebf78c611ae0991b29e74e172dded995eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0e7535df2b4da1d28171a268520c5e5c315dc10625b927f2144601fb8fafbe65"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8bedc7684f06091f019605565a03c006a1ebb06f9fc52e6a64a161042816d8bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c1e9fee0e21fe54eada9e9498e268bc3c9deb6a9bc0ca4a1dc16f1f25ebcb323"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "38c619de4b7e892e8dbe1e9dcd6d5cd1213c0eac5d419c3127ec2139de8feda5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, 
false, false, false, false, "728fbf93a0b5da82531843f1dabfa34621daf3ff5a4821707748072b79c5a626"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "43de26822a86d91e1d32c089218118b6f856beff1d1fc8484385c4d842ed6279"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "30b7f6ea197ceaf087fa0a1d66dc21756f81b6a8267b665197f2c96d7139646a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "30a5b34cdaa7fa6b304a6e8965329570c2a4710bb41fe6d8b33de40d2bf73153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 
0, 1, 0, 0, false, false, false, false, false, "ad72c0c73ec5cf21ebcc5e0487309c55a322e68c0ad979e42b42912e8377f5ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "096018aaa2fc98018ea7e485b6e24a8a9d0d4b70ebec4c22a30a35e004de04a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a151f63cce163b64f5bf01120517e1cc3ec14214d8a0fef41d57a0215a2e39f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "33411bce48f1d810602fc49ed92324bd904eef190ca19edb5947021fb5b64169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "de0b120a994e0c7d94d40b4ec3c78007ca45536867011c58cda26ffadafea397"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f72b534246e9d5c10c34369557ad096bbd2bc13220ad1b0dfe726457a9c35a27"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3016c3485aec6bf190897476eee2d83c926def5c1d703f38a105ef4285135f4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d642ca653d460a0e0e5b47d1aee398b13a9e390eca766a0e1efceeb03d12ee82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ce99dd160cd8b68ff089326c56218e8293f9229bd0018dba5920121b13beaa45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "248b34b50a5991cb56d5468c149188b9782281f61e0ba987c5b9b3b08cfcc72f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "15c9fe05cf708391fd5366db20c132e5d7ebd3738c9615353ac44aa096b1a9c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4fbb23e29da3a87eb2e8096af063a96f124e51d78bdc7078021ec44ec171bf72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "05928c0c6af7af63790b6045624b8b07363578f201d99173ee389d4b1e23fdf0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca768f2dd7d55554ca31e719474a42120084c513c59023a63705118c80e9c919"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3d86b6fcfb862d696a1ac4b7eaa8fc5a3e8bcc54e85ddf64b1829c17ed299559"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"c6f1bf6be7f6e69ce727aeeb9e6a3a1b8bec3724eb93eba0642d05cd54b2c6f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "091432db802b2b98d120cf3c42cb3e43e0f8b65ff7749fece54eb623d1bb87e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "8448bf00d42cf5e0413494f47cab45d228ba1a0f4f2d23b8cad2bc6ab7a5ad16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "07d0858275796e475f7a67f3863bbab6a1d9e2d5c9bff38436b593fa5f060c4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, 
false, false, false, "17774a61f32e41d9c79fe01077fa050556bdf9772c737889a183a88d61414583"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bd9ce5f85e27131e1914187756dd073be1e946456ef5c7609890c1320ae61dd3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a52884fd149d0941e4e9003ae0d1555edd0aafd1e6965f5bd9b6411c49d95e11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "5b6cd89ea5356ab0a71fe782b8dd14e86426c933114f2e7404fe901312e1386c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "9d0bc5a0a0760522873ba71e5ae364c8c01d0d697c180ddb4c34686872bd78b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "654f5101e202198fba0c7cf0213b3ea27d550f183b84f3016c393db6aad0771f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "db2a8093e717f5507972c7c8dfd974f3aed7c1240892998846fa09b5fc33425d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9940c721d27806545e22979c70b74fa77ec901e253489bbe9bc3efb154dfbb92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7e46fca53b58b0c88889b1d54f07001835b526dca41e16de55db091c61d488a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "79891c286236f125a415496ffdec30b9f693482d8153c3d4f6c9da2c5531d4b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "b0bd4ab823dc67c88fe798c4e6ee75397954b9000252bec56d3c0822eec88382"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "43ccebdf5afb652c0014c739619b905f6a4c23a9e4181866ee598d84982409cf"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "47eb66d6637ccdfa086d8d5014433f1fdad0b49eac00e9018600cc742b78c343"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b36a831c94558ee1c75d868cf2bdb27474cdfcad9e233743fd9130bb3cc352bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fc6dbc1b0d19ccbfed46bf156c6f4d5dcfe4f117c4de3eadafb7fd9388fbe6bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ead70acf719412e6d7314a18152565dbac084bf9e5b5e40b0f361036e5676e5e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "62ddf1fc0ad2c695882f71b432ab312168d684455a7de5a8cb4fd18dff39a52b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f29798b4b467211a096eb1547a123a86d7532c9f505d9833202b5130dd412234"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "46784c1148adfe5d3e54a9ec8a8a8945edee6248de64890bf9bdbb3323c7448c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f4b418d98ea96c5eec974ef0fb11d6ad7a2a5fb01e7f31e0eb0ad2607e7faaca"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e0fed08aedc63377b016eb9d51f56c4c0778ad77bd7ed9065f74b06aa12adefb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "784cb1116acc421ee239469372c0d3ee5f6b5248c5289a6aa1fc46f8c1351311"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "eb50c70feef78193e29d4cc7c18d22bfae5919523fa5600afc8dc4cea8ce50c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3d9743a13bffb04ca16631ef9cc9415b4fb61abd85834948e7b1ec19013dc290"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2cce7a54bc71ed307ee92b751b245e5e1a96045d5dc23ac9d0272e8b18e14b7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6faf09ead6025da5688ea9d9497199c96c50112155a3d95411c0ae45221775fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3134f99b1884abef7d522b565dd6eb1f6bdf4d97e968846e7c056b7aa32819b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1675df4c12784931d487b76785f477286056c94656ff06a12c3a27bdcb5b7d4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2b5b83d88ed18b9cd246d148750f4dc25d7b99d5f63215eff8828fdfe772436f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f6d9c1cda8a696f3ba8d0d25e77f11933516d737295d5d72f81039431ede0dfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, 
"3e2d8eb87f7b80fa5520ba90a81bf419e5479fb11b7eb69570ac29ae139f2664"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ef48511e3fc3f2131a45923e22c99d730191b68c0654a560cee28c1f839889c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "65927f03e0c8fb2463e61185eeb4459f7eb6cf3a5dde60e4a1e059443000103d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c1dc8009edad2b644408047c148b0bf2397b7e86ab8d0d444aff62f4ce099754"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "017ae5733367251d3c9485c52b03584dec67dc51840d766e2b52dbfc7c0311a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "50d1f70449d09387e02d328657d16fd756ae2eb43ae2cce366bcd8a12ddd8cbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a63adee229ad5bd427701ff68c7f659c720d74e814a8feb4eb6ed614f1cddfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cc44cecd73dccf2a14f0359197ea5988f7ecae9bfb5c7d1828c0df6b4c85e9d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8ffad205c54903ee7f8479942aa7008c871c778cb6d9e63f5a208c4dd0d07b00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "06bebcfde741d33afe83e3e480c2ab55f5ca120e22cb613e1da797d745d35cc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8662952ace08cdab7261f5951edffc716eba6cf5824d921d0e35656bd0af5590"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "14b0fb3b21b473584a98440288bfe0d098d55f197b2ad3ebfa27bd90d3b72300"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 
256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c5bb8ba04355cdf068816e65ab590881bdfd181dcda163dead26020908489c84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "52c4499452e185674a37a9d972c3fa69410edcd97199d17b1016ffc3ad1a77e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4df2e068ccd0e38d5cb2380ce507c695884ee03db17179573ee1a2028f22b3e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "44fd7cd29958d6630a555fdf2f63754a8c162e20f63c64e97a0aea62b47b7a19"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "565c94af7b1302931ff0eb325e691f61af5f9959079028f0914e6605f67a768d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f8705e7c1ade0fa1b742140f26b99159faad1782cf32d86e94b7fd7b7a41d012"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3aca6a206e4b75966c746275fc543a5e23883eb56887c9b1ac5eba1c54a240c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fbbd81a80c4d5a583c285b90d454c49c114e08aeccfd4464fefef4b116c18921"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2c904baf842d519d6694278e02c8d6007a8c9d269bae98649c4f8815eddddd5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e2e89c166b11ab40dffa4c17e6491ea1d3d70ddaa37e2e458d8a8fc3659ab777"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3c1d890a72eba4035a1ae299ef88d4fa9b49339bcb06ffbc400f69a9f22d7682"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ce8d1aa3f4f88ac82f2c67fc265552077cd06bfe4ea5bf7b9a183291c4236244"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6a42dbe4131fc7a832ccc8afad002e96568612a2997a7e66531f5ec714dc3254"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "9acb343e29a8fc3f5b7d72d2d655aa853e2f8878e49f63a0341127044f0aa7cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "98456f49c3b0b96e4262759864a2c0d00400c5d420bdea2813b7acb667717739"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 
2, 0, 1, 0, 0, false, false, false, false, false, "3fea91b0ffc061f792eda7469778f92297e37b841382af54fd06685f8f3f8939"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "12a5de4615caeec946d1c5b05077c9b818624cf210952e297ad1670100192758"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "086778dde4f1a41c3514ed1ed93d59452d0edd8897eeaab0b5ec7a548a5d3d3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c4217d7f363581ca06c28032f675214950da9dd569ac0b68699b2b7210b645b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, 
false, false, "c6e982d44fab3b555a12cb80dbcd22098c6bd38ce044f051d0e91cac070c1d2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "29cf1d6ec224753c06deef3936dffd71d031d3f371da9cd6cfc8911cf3f4a34a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "734ae1e9295868c9b4ceab477a37952d3e6cef1ccb324e5ec407885f51cdcba2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "43d755cdb6868740881da19356a9954826aedcca3361e74cf6aac714eeea568f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "521de02ca65a70565832c1e36ef1cfba2e3732a288b295003df796e8a1811f65"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "890f351adeed7cc61fa32f105d9835fd6ead944170be82463f2ae1d3b4510bc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a00b41e480b0b996e4cabcfd9153bb2aa0cfe880e44e3cb2e8a09626438a7492"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3cf27418b4bd2f68f77c72cc2c4727ff16d22a347cac3286cfd1dfb8e22e5964"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "692b6d122243ffd33a3fe4c2736223b7c4501060ae76f85892c7d3d08a6ae14d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "fda2497590ee4f894b71570f16d4e30dbf756400889eb00b21251adff674dfba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d3795ef2472b35cca72394dd3652e80b697e050757c3c081b270c6333898d067"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "da22bb8af036d7fbfc381afae1694febd958fb87d419f09d23da1b30a43aed08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "39929a0ba84a775e65e7fb37fc6451cb30ac94101d1ed3e85ed8c8b639baae37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f7fc702d67d904c4303bf60c4963f3a36e5bb4e20581682ec9d81aaea4b56233"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "593534de1d7d1184065a67cebd293e0f9bfd6bc7bcdd7981b38cfbfe78085b85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c68d7d13d63258d49c819a2d63835da3047e30fc4c2046e8aab619b4c60fff8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, 
kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "75247495cb3e8d2217e404157625aad7dc27c17db852c7d94a651ae921ea0d05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4bd0f0f786dd934611e4a0c125da71b24e250e6e258bfa13daa2d0e8b3f5eccb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b6010ef4ef174f56485beca5c34cda23bfb9074c66eb92f520ecc921dbbc4dd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9e186ae186618cd715045523eccdb97469e0ee3888c0373696f6137f0d9a49cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7b5e9af6391281959ea7cd14dea905f9fcc719c90a49d7132e8865d4fc70a7a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f1b66b7fa225e818fd5f47ff4298e84b1bf36fbad76b35d401f488ad6b394569"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bc486d0c8030a057f509b671c2dca7500b63e285f13d58154ee7d5eea432fed6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1a5328d9f3000ab01f0b655440e3f381163d47a919e6865e138d877d6a1133bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d8096296aea9785b9f29889244a74127c0a22d616cc24ee4b05593b404a9f793"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "30a234389152b52c41e6742863dd64249abde514469c3936a1da2ab524ef7fdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8bf67862c52ed34f6e27bbce85ac18222fcb9e96d94e842947625fe517e179ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d7edd2b5cee281bdfd82bbeee470522c5d1083dc5a0bb9b641d9476545160247"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ffa1e0c8c4ae2d277e43b392b8a6c58ae19858168beb96519793e97ba4806276"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "36e133cc7d4be469a9198be5c042636534db7cb5107d004ad59eb0907d7c8f1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "110fadc31f93c3faaee3e589cf978e2fe8df57d6550bd5aa3bf89ee5050d336f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"0f9a3508e56c111de3a2c44f51d6f1cb38cee388fa8ba6002b13cedb1b0ec9ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "add8a3e810f284d474ccb89ae3cc4777809312b661c491bed2d27f048032d24b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0098118e55b9bb97cdc72932c63fc900fd04359f8cf070356f10a71762b0e9db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c8dce4ce9cf029588bd2e4aefa26462c26ca9f7a2af6b0f3f1ea0b35d4a6149e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, 
false, "de35652f4273e1ab0dd9b62767e1e9600cfbd949b3ecd8085eb203eaa1e84eda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "305773b02ead09e5cc1c91f9f66a382fed595feea5334ebd2d4febf0eb82f735"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0b93f9d285e124e338c869bdf405eed222ef420497d9e6308e113e3a6fe6d6e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "e88918e64a3f645fbc35d7c31c12da173c7dc9071bae338907023eec07e4fe6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 
83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ab133cc888d89d58b291a08af38a277dd554db10e937f4d957a6fc3d423761c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6c3226b241a4b60211039127c4fa66a35a233e6ebf2c6c8a3c909f5956c8f30b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6d71cbceb2e250a6416baba68380a3dc3fae0f29920cabf98faa0a07fe710ac2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d1c24b376660d3827e1904f3460c81577cdd828b32b3d7b0f4c79dcd142d63b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 
1, 1, 0, false, false, false, false, false, "2182a36ccfc2c6a7d2bce29d8d2617fb7506516650885750974002e96e841d04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "cfea90b58576b76d456c5448d44835d1f8865bf41a82c7f8d779a3998124af7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a64cfb54e986a4a0a55445909ccacebe6f95e8744c6a6ad31283cf426039e4a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3125bc998f20bac6f18139da940800ae165228c9ad5464445519f2614ae6fdd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 
512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "42d765817be68b343674f4200f72584760a7b8d8e60584f714338e890a785f7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e0629605585a9dc3ca29c94e6d3ecb090f7390aa7a2144dfa946b5cc3ed51a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "07121132ab2a614e78a6830fa08219d5b371886d72efd10395a02dfda23b3dfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b1b4784f94fc99ebcb5b7cfbe5eabdd646a2a614166c36a8c49568052a49a114"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9e78013e73a51e7d172b799893916cb7c8ea77eb5f1e77a6e64fc0be2ac755b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "bdde2454c5bfb8f89e2b4e3d7a3d46fda9a2094d0d85549b859eee0588123f9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "1d58322ba771aa98b7c3673ce29486e0fc61f0f31b05e3670b051a3d9a9f9ae4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7ed8a8f825163e0b7d616dfcab21e70fe4cba85cc4c3b20b96ad062a3f0ce84b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6f6760684dbf8003eec4d1da8c4c7e7c7d16cee2b520267eba662fc6c07a3eb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "13d3b2de888cbd6b55c1171ba7a864977fc0ce5b9c5118d1e06b4b55b8566b6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1f8f3b98d1e19b952eba0bb3d69992fc3e9ac544352280f4140d603757656c32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "98c0b51f856c3b5bee1fa54df507a972a514a14b8ecf8d510d383896dd909ae0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b26fcab6ed7ac07620491a4984d09f258dbacbd1d69256b542ff94d9156d5833"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0148875e458bc35ccffc73826c276994262b45922e119724056516d32a246094"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aa5e40cdb0ddc1965baf8a0553ddd1e369fc9d74f566d2214ff2a0d4db410942"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b22913c662351ed92e1ba0a3f3885e5c095ad6b6ade755e05a62257f50c64c87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "910f225f987bd7f00f7b7b65454f5927968b18fce3c6f980f786d91ea23b631e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ae2edcf5436a32556461eae6dfb3f3206923d62fe9ae89cc446a202828514cdc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "fb8b8fb60a3b4687cf5e6e45ca09461f8755e8bf136ddc0d10e46e4a8e201434"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4620892c4f107aefe0c14b1ce834bd9b3ebfa8774d04f519144b1904ec039e79"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "89f8a5f44ec289b2117f10ed183983cdc4ccaf175818c4be0f8ebc13aef19b8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a50f858eab3a7eb7e46558588729b40cb4a2f9b1504104fdaadb610eba32eaac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9f541c23cb94c5f07f373afe77389acfff234285c3ad3f72d0fe5cd94bf0e91f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, 
"43576a7bc2622ba66b34c6ed8ccf3e99c0b63605aecfa298e3491458219f393b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2000aa4985ffb56c987879d7be43a3e4249f771010ef8e9a47b10780308d02f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0aae3897ddf148a43d8dec8bdf233abf0739f21dca2a28f3a1ebc18940b8319d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a4b3002d7bb2131c3e2ae2759591371a80832212af4502a6212e8b8551c67dc8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, 
false, false, false, false, "41d810a4b8c5ccfd136174d43fa28761f2839fdd8c379d254d2ac756bb4ad965"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "050f3686439c173e8afb9bb75cac264d23f05c24aea7a1788e134d5db529dda4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e3011fafba05b8af8e4b2e70bcb15b50af3636584e6d47b2d091f5d4e1ccd4e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8cec52b70694ef4fed6d485f0b6c1c1a9f4d3cf695e5c94d3c37cb66807b156b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"c30fa0179e20e222f3764b0a826aca4bc7eced3d9aaac0f7001d53faf2a0d144"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7eb9604d6a3e54e4073c1f436f2ed2b9bc91f4ebfba39966168ffe6091c40876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5444924f93b394e9c7d00d702675a48d6a0032906e5622285560cddf8f2932ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "9ce39bb886e9682bb5b4da9aa69821035949609a3223dd9d9f5955e661edd27d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"0ceffc9c18430dfe3360f9608813c70ee8cc566d2246c0d197fb4b8ed2e2a165"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "183b55b7beca1aa78103a4997ec17deba44258ec5c78d493467501ea6f2c0db5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "654ca58beff79f76d648bac3c772d0872dea61bdaca7aec1eb502889f68747c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "8850c7eb360b45aed3b3932732a7b0d744df28ab646508f48247729c79a05b77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d4f8f441997f4a8992808079f82680ce744c9cd22c023c343a17b1fd5af5788e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "74e3e84be6f7af61f793308d7a6c4d94e62033787aa206b2d1679b398979a2c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "77a15d58f1a519a7ea2c0a8f533c76f5b32f3fb3caac4bc34efac95d912ba7f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c312dd519690c409cbe24111cc105c8bccb42cf82738dd69728fc520d714605d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8dd12d3493d76d7841c887de10c722a8d86e91d1bd6002b318e9f2026fb90414"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ddff00b269b52581601694c3f49c7ec304ac157ba127d4ca8e469d7a117d028a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a5b881a7aef755621fee6a045e7e6d0f01f851d28204022e9cc671b2115bd99d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5685fdbd2a78c40b1a3d5d71573ef043c1dac6309a14d7f0dbdf1adbe30d25b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3bb2083f6a45d45d15e155b472602b80f3467394d906d2c3293825690e334b9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d581eea5ccb7fc64288a6e3f55e510b277c0ce0ab099b744f0a0cd9bdd79b9e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6063d1d036bc183a6b8f23a0bcba4e7e4c132545e42337e30e556748861636ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "897f3cf32abdef81f60916bec2578355725468ea669b5726137585cf6370355a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e82bae7ab0c0cecb0974365b6a28c40f327177c434025871c8a55a29ecce73d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2792c2339a973b4a7e46465084f26b24a2624f50fa76cf4cad7b69754c735481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0d2c6c5b7ac20e1139c374b28d248677579d733c0a194e81ef9a448ea03962dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "af7c49f208648cd5e68d635a21afb148f4213bff2b880426ed1e9ee2f5943108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "71500417ab4bc8726bfcff3d9b0d832b5d5be807e251770264e8023c78517b23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c6e954c0413245aa57ab8246d9b13e2c2ffb48668cc84a0a90bfc381da170fff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b5849b44d0b67452731bf31f13865c174acefcae3c065a4c15cfb0ae299e622d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, 
"1d6622ac02d98a299d01cc9672bc0ff409afb4de43c83ac2f82a36a9562cf960"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "99397875bf3dc892731ba5fa8c3f5b1f17a588a013539dcf07bf7016413cf032"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "cfc1d94156b7b33f0e4438fe8a15445d45591abe23720c044621e378bac8f4a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2bd42e914aaaefd556b16a1c116e5b6eeb725d4d1300b9a5c9cba396e0591c6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, 
false, "d431ca520d04fff910366f941a6f90ecf94abc5bc1439a04bd668782c915e979"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "65e1cda0285e8381ec04e1195afb5a96124e451cf5a47038858f32baad6e102f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "71ee27f4c1a45c321e92e63f063e946e6d5dac32da94b3b41031be6c58d73d2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b12b4fb38cb5ae7a06b89e91e2f2682da2944bdadd1ae0b71ccf71a362cafbf1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, 
false, false, false, false, false, "9d4ae2617de40e420b6688d972d57e158307829b83e61069d50f9da55f006e18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f0dd2127382a5befbba6555b39a67828734767d15ec1293a6eac9e9377bd4146"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "51733e9ca073a2fd83a52323d640993d3e51b81d59da8b633e68708b5c058e07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "cc0208bd2c6667a5035b52dc665f222b8570d4674e1234c4a1e43e0cb3a62cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, 
false, "068ec55e1afc1255e3019651f79c0397bd17139d61d12915c0ab0a146699a4f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bfb842eeecafaff0d24ccd27066b0d0ec324756782496204560b6c1c29fe9655"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f797c65d3e7d8b904d46a788806739f675a28ab417e94fa8378713f97461fe3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0213a682c6ba2cd0fff98cce71e652b0e7b4f85bdcd412b587ab28ea0f424cca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "04e47b8e3f80c07cf87ef1d5cf926a69f366210a4e4aa671235d3941930d0d48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3b7ea483a4e76bb008ba12003f96aa1ffb9ffd60d402976a990764b274727fd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a016641a00963ec80e7240695743cd1156f94a1069bae597cccfebe5ee26f16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "352bcd97106ed61be338bea371574ef03636662d77a688342d782a5b17fe799a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0b1743b2bf93c531ecbe9ea4109feb3bce8e92bf0ee9bf9978eefe0cd8a6afff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3e6ab2572c61239f2190848118b2469ef2aaf317a77c97bb7f416ab00a96c814"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "7c1826de40a5db061844f1a35fe055ae7059eb8ce44cbd3b9cf8034e94fae572"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a1e6d7e0aa1d0415180abc3bfc5726f73eb83f9a62f4bcb01e897755b2ba8dca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "859f3f1a7a5796fac7b67e52c24a193223ef9e9f18c9623ff172b27323c8e639"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "15abb3a291c20dfd5b35e5512bf1a7164670ec177359f49920ed27af440d8c9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "40e1505a085d8b60e71f280fea917fd7e43fdbcbba64ff3a9f83762b439da4c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2a5bea1783750b354da40f249ee1a73446d2a1aec65d0295895ba0ece9a50dd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "6fbceb7efe4904474e12257d671b5e4823e5a32278a7569d8d89c04eb110aae0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b207f39712183f7c82e2f45495c2e5a25a5ba9989e4513513853f3d718064cc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b0315f8b239d615b582051438f9037a27a55a283a67a445f0863ba838ca7ee03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6bf157c8e0b8c785ae52b61f840c276285da910ea71e8816347b0c2c97468773"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, 
FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "9ad4f354486b42eb8db9bfb84fe58c3016edc337fcd11b413eac23211f61baf3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b042314fe52bb1636fbf256c17ebfdc06eb47b53df2ddc1b30c915d7888f00bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1b9ebce31f1d59f8205eea5b086d0135ea70188478290cb59dd377195e523f20"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"85c6c4653149e214702b8118daedbe72f2f575abccb6942281a6705ae0541485"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "79f353f2c1b8e25c2ff52c38843aaeff4e733a067728f3407c8ec17e76858347"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c15139d4a1cbc8934053dbfb0e940f67d77784780563812a51af85bab3973125"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0924650423561acf4606b264cc5afe95ec4ab5787c55482d7f52fdaaea8f091d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"66a66d41aa2faf8cab56d3bdff104f2e3e577d5a139b40a67a966085ec5851e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f4a825c23038a3cccd65658e3e8237646e3b4cee20003657cc98f975620074c8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "aac573a70398870c52400978d8f9752448bb76daa3eefeb308b2780a6ab3b79b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "467cc670546b9ffb4fd72c9dcb9ec47d309ae45c36b963bd118aca319b132040"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 
164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0f040a61f6a8431e543505bb26ebd5118824942fb1aeec2a3b4084e97b4033c7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "590242ed84abea4a72e39cb766d19b6e9a7a591cd16c5b63051dd7d0cf869760"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "23d200ba633cc282eb880bba15f4adc6337d6f14db3d78aba7912fa3555edf08"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ee1f6364fcb88f7afdd0f54b928db0c40f09da064aa2172c52d1e9128dabcc4b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 
1, 0, 0, false, false, false, false, false, "d99f6e607668c372bf88d73ca331f53d6474208f33445fd05e08ef93733aeaea"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ca746cd641c4505a964dd7e577db63861233d83923c15631087be9f005d53bd2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "740892943888b812359d8565184ad79817361b6c8235939efade9732f0e989c4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bd0147a41419ef8b97c3a1bb8376ac95af0af2f527ff685005b7980cb5e17d94"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b9528a0580b1d286aa32a463d4eb6db496a65aa85bee311d23785c03b8f8b7f3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "82c3f927d1bccbbcabe0d058f0e3d5d8f157bf38d7ab3f80514a26f88bbd3c00"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "367ad2a08c41a21f5dc9ba7578e2aceab9ebf58f69034e40473fac8ab3b6a647"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6256cc9041ae8fff88e876bd57fb8c6492b04a16b4b2d774183cd789721a7bfb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "71380233e738de2b8dbef1ed978e5dbe20d06c39966397aeeb8dcfcc48accf10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6b851bd8f4826e458d7a9faa52f122721340a35feb3696c85f202e97971ce05a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "255981fdc6db62854a539d2fe6db24a9b7800bf91199314eb7710b209f43e5c7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "556f00a3aa41aaad8624a3e72f8a3e9906b971eb097d5ab01914763040a8c27d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "05fcee4b9cc1ab02bf3835fb630068e8fbf1c422c64c7c67629ae3a80cf7a1e8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a92df6ee417a893872240535368db021cae626d41be0103718a33fecb52aaf48"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "53348ffe0f0c5f8c645aedd952c2071e93883378ca57f1fbc5786d4b5f3351d0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b00ea14835937bf34fe30c01e7eb2abcbc9366c79cba6859955fe298287f8fcf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 
256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "e783c3abab7944891e2161be47f3430a0a94ab7db09f5966cb1a96b74797c769"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "50594f59ec2e4c7c00fcac0bf128559ab95d1380875cabc26a924bb78509802c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "08393f906beb50e1abf8dbae06b36113f35a46afe203a19825e95c93fd1a6283"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "9027df64c85bcef0acc2e75342724b1697ffc05c484c258b391b86030e85b556"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, 
FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9aabdcfe93867aebccfd3e01788357161009e953b1ae5658e76858b1a4653b05"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "cafe95cd5680970faab01fd2f1be2578ce813f18f2201b1851a69cb72f5b7376"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "022475ec2f438667c63a8dad02d234081e9ac33ae30e14622d0f9242464ac720"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"496dfba2d4b9c8a66b668200d4b8189eb7a3d02684adcbfab3411b5d9af45f69"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d325d98c22a31e0d4552eeb7fc1918f05b94a875a0a43050866fec776c89b997"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "98fab2c21d54a75cc38492bf0eaa25064d7b36ddb455c847b4bf161881903c40"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6d1a1ef7b89ccfffc362f7d2cf2714f93b5df962fab37dc90a3f3726723c7acf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"ee68e4f92933c7a3f0b431eb7637f4075786dfb77f50f9f97e73afd7ad1d7a11"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7b5d9180d0f03783e0f6e06fa051a510684b6423be33579d27a09144ee47b0d4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2c61111cbf0aa34197ff002b59e9c7b4532ab4e26a5a95fc92ee725c53285564"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "15e600f527f6b73d1c85fb7f45360f3fb5c7938f16911a0d50dbb7d64d5b446b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "34f2cf5a672d74e758c81dca117e71a1fc3fc7cfe09e75b78bd9b459a7294f6f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "aefb92198b6a754f083eda9dfd9c4e22ebcf1c8fe2601a2063c73ed1f20f732e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "97551e7ac60628491cc5c722f3acc155da2828fcb10c75baffec31faf717ff4d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d23995e9fdad42ff90c1ec3bfd6eca6980d580bbcd23ce3ccbd24a076070d22b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"90d0320e6265ff6e74c5cd77371c35a8305839b1f347b44583457db6b3b45709"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "53b672f76e70c5d2dc04de59ea0bff6e6e1a413f496d472dfb5cf6813800a19f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1e4e4d2e7cb2cbf3e000fb18999b993ae909b4bad4e68ceb7d39b71b53993293"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9d78be211193a14a38c03a48b688c4a09fa2321880e861fd100b5889bd5e1140"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, 
"e7fd524f7319e85fa49122c802351cbc1a945e1d33f6e95d7abc087b0aacb992"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "48950fedfb3b7209d23f3570ba471ba8b8fe483ccff31911f1fde6a94a092024"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7bd5d3c4e44c907270e51cf981fa7985bddf21149ed8fb6fe3cc67557fb556b6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0d40c5aae4dd5e34d95db8abfeed56213b25c92a3d4329e64a79c8c952b83779"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "457fd72674677ec804c45e58a17b23af37f41ea0ff12c9b84318d780289711dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a5e289f2822864a6eaf7665100e352f55a4e0774639411e1ab2a83dbbc00d4f7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "9a08209488f5387fdf6b3207fb74ac76fd3aa98c21308f7ce34a4969fa4cefd9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "14de1e74606db11be54dcbc7a4a8c0b1af700bfdc94987d25ff79fb7b639109c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "630c13724eb6557597ebfc2c2df8bc7cc0e754fb3f624220e25f297144d0fe26"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "71ecc9c0a8cc63725aa35971fddfa4f7b019de673115b0398a88c514ab508bf0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "eaa4ae198363953d69a21794a98eb65ccac700bdd27a77ae187789bd37c11365"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 
64, 3, 3, 128, 0, 0, true, false, false, false, false, "47d06fb3da44c55356d36d7ffaf5274eab25d902051bbe4e6858198cd408fd51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "756842e32310b3c2265d43f2d8dd8b65385daad8d20e76b2a9bc77400250debc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2be7ac300c1d79f00b8bc5dff55831c674d599f82e7ce822f6fe7f8e8e36612f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ddbfc1ab95216bd40ee85748c147bb2f59e898e8895a0119289feb01cf90e068"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d3699a3f3bf609ba91575c5af9b96f46b0f35d026d5eedd5ad29f5c0b4f14f4a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "21a36164502f71ceacaad01a7fed899cac653eaf55181497880f7717dbd7a607"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f2558adc88c76698b16934bdb91ff87486c13fceba0cfcba4c4ff7380d6cc2dd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"e5d6f0663bfb6568600995567627c6ae753e32556d7d2e148e334b0635e2acee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b18864805ba0e8eca0a196d651f3c176b02161bef58c1f4854720e221bb6cf73"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1b249aebee583ade3e98a276645e2d2d8f356f9e9eef5edcf3c43e99fd17e140"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "24f5ae34d1de8d0b1858e9b202b0c227034cd62d1fa026df637d596a8e49214d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "361325f45f40f1470a788cf1dce4a014dbaad39ee5407f7f3fd1af9bcd9e298a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "30835e85d3a8a2b438c90414905ccfbb78cf2cbc95b942e683e4d3b631f085e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8c46c404a73582ec1d89704f1ad942b3da0fa090d136074c756273a8d8dc75be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "64e870b36b5b0f3ba4c8745cc05a1eccacc54ef70ae6253396572473f168716b"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c62c25d21c1adddab217fc0a0d9fbd62310211fd161d7827061bcb12c0769488"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "4ce7c905f0d5267fbd256dd5b8a6355d3ca76f6a7b60f0dd94accd56cf8b44c6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "71267f1c2bf229ed3ec3f0c95ae02bb8220477032daf77564ee17969505484d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "34d28cf86de04141200d4a1f246168b9be2178bfa05c781273960c9d6a0f363e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0edafb25c7f9a3f62b24de39d10c6eae3dd8e6d837bc887fe3b9f6c330291f30"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c0ed90493b7a2b273b0e87ed4cfc19046acdc24533babe0211b7c054c3b289de"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, 
"be09b1c501057fec80eedf51b3d87b2ff6531a14fff34e0f3d723c49128546ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "247d0cb4119942684afda0e4203e2a1fb0c9be6625d775f586799b62ee742aa1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1da50cb1f8e3b16609be48df759e045c0100c3f3da4990c7f0b9b1643cd4c3bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "59623b0c47a2e8962fe2bb94fa611ae50d02ad34572154596f25a13818d5cbc0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5a79e38c2e585d3ba9e9cf900f9e58d52a552d44290fac75de6b87a7f70d5da4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "b731302c24a3666dba67a9075720e289169735bbb6466f352199b93a4e306bee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3b609896609f056feb9d7327a7a469f7c583301bc4ed20dc46ddde101f145c75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "3adc2e52f878532cdfeeb000d703f61361361f8524e28aebec2aa17a7e8dfa01"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "4b4bc67c6d463e5b368f253f21286f3d45305b702b5c29f5d1f374e3fe4ad991"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fb8855f2d36407e75195b47c6a973becea6f3ec95b2fb4191cae41915f95946d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 
512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "b74a6335dc835528d8c8c2832c41e0fa0a631d2bc5a483226f465e82c59e403a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2b687af13ac85fbef6c26bef425a214fd0b554df82ff00c2c4154c1d9fb6c9ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "cbe37f7e147438539f1e978f4d978b7ff157af563a829b0198c7c979f7687bec"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f5b7f3bbdf972cfe74c95bc64cc98adb6dbd23d0a0ea9daab98eaa3df3c84f89"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "50fbd7f64d6745686a273f576424ffc9a3efa8b5b9bed1951bc0f53e5dfdad3f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f93ee8e710cfb582865690867c2691c96572834c60ba55e24e42ad9e98b365a4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "65a13b8c08996eac65a91aff0bcd0ac4ec27eeee713c9ad3551d6e023112c361"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, 
false, "2c86eda2150b248c86c20321218d4d30cb2460f57ff6a51752418d065b12de43"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0a6a3414c212eac65cdc77fbb424b9850cc5d274971bb1ba2cca45a2a6f28656"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3c33b2d4b2d38321a7492a2c690fba5fa73e3df6ba9c06cdf3ff4e0f2bd1d951"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1876e9d28235b266ee58627462b88040efab4fb6dedcf74b7f57df23b550a498"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "147cb598211a5d4fb90d7a1b88f5415989e09432be2bbfba220183d5e88540d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "80d37bc939a1160044e1a583a9131566c2ae2389d63cf49f2e9d6809804d3c89"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c865786fea6141651a650fd3e815d1200ef1b2a800ec06686461f1dbee40cb82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"2a5e59f9947d5e73c24bacc04f8f02286de51954fd4f4c051da2711694ad1c56"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0071a698d3b8fb379821efe7c672607002e47d5b2d33e4a39d286b61b8ebf1bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e46c7bfd0a8aae5900869125c85898fed6cc8e66724482d14d2186ba8ed01208"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "046481c7a549a5476b27c26c80491990afdbba7cddaf24ab213b51d2ec6b06a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "9b45d1bf5cafccb729ac8a0e9ccc0442f4b998c7c66f03694f1f554cb70d13f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "36f3986de1a0f05c6947ef03c077bbb84d6b76d7d3bd6711e4c2b1b00cd4e1c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "e3278691104c152379acc04915cc2ca343de07b8b89646784fa37f878c8bb23f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, 
false, false, false, "dbf975f77e591d49804b549ab73ef71faab1fa0b6764588b10a6fdf33aeaad07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "e5d55518ea22ae8f99fd25a8bf914d219af44d3377018a650b4e25bb440416b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "5dff76a248850f21926fbbc4743ea97b117cc28c024a11eb597dfd89deacf09f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b35e2c4e28b17bfc776f3e9b1acecc78ac2dd292c0cdc62465bce3af2fe8827e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "cee1a9f7450d0b1223914549a33fcea0051f696b8a67246389462419e3b0f891"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "ef5fed8c784f9e0b1c7e33cf1deedafc3fa2d2802063d817f4928a8c0665d0c8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "f8b017983fcc8b8b083642efce6ccf303f9fa9aa7a456a3fc9c9d115c9e149b9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "8e3bda5a1513219fd5803653c8fc983ee2972fbca557f2dff02f436959b531dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "bbbb711ba069dfd140fff4b8c04473a581e2d1e4f17694cb76d881ef7257187c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "939aa017d0c091cc410c0f88b3d2b838b73d930abb4128a77f356bef2f52cdb1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "03ca199656a7070d37475fdcb00a934e55352822a725f327ef9b7fd10aa66ae8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bb0780e1bc71ef95f4acca0b0aff0fe121ae4bdcd29e4620734d08830fd6ee21"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "21ede5e61705fb15a366dde9d277dc746ef61564117464f6cfd44d4839e12f13"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "807203c79853e0abbf51d102dd63401130dade11343cd7c664032e3e3b17fed5"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "130f5ff9fa30195fa9021b573a2e5da8907e7f7c5a94a45c75e35e764888e5da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "26f2c18fd3dfeed595dd9440690913cb33c85f0c2da1d69f6983c64bcda00698"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b0c53dae2bb8e76e9096f8d397926c02aa30c9baf1fa4c5cf2f33c735d2b43ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "8e95a5dd8c2ab0721163ef513cd96ee665dcb468d353ba2ccd4950c70296d1ee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "6b8cbb5559b1bca1a3041ea70dd569cec4553efe04697a92fcdbd869280e8760"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "794d7a98027f6cdc5806c764ef66ee64442a6e9c323d65b6b2b07e97c9efba1f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "80aba4f3aec5ec7c63c98d191ab97484f5be70a75ea2c2aa3e9753bff76b310a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c1bc85572ca504d480d30b9209817e16a8fca97a276e1ac33b79c2c3bdc23af8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2bdd41ad578b530e1def5e00b5e414e7615bc8afb76afee38b8ff3f1757b6534"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d313c075bfdf92c6d6bb61157a27c3290b7eb0299fc3ed10237d551ed841bcfe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 0, 2, 8, 0, 1, true, false, false, false, false, "4202e7d752a0a776eecb416e622350ef7eab767d0c4e2cac8cd25c515300775a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0b9f53b680c7c08c0c74e5c04522f612d3a74a316f4b83835511c16346896156"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c059149a81deb0ad820e6ed77f98dea5d87860dc3365609d2d6675fe45d4fa2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "28e3e20a1ca8cf2865d92c8c3ae7d3c460d0d3819bc04596785c0c78578fe37d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4e00049e587c58e146b9048b92e98595b2c0e4a07b0f0f21a609b5d393329362"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "01b0c06fe4ae37cf2928d5abf81f6bac5a467a70340bfa5ebe924a040b9d53f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b3471b43a9b0a822b64cda739671156880ae39f3225c0fc886a0ad43712b4929"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2b0cf317efb1ce89239d8db078b15b68587821c0529d2a235571c4299578e010"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "fe57e17ef4a9db3918baa72a9df2b0113c92214742c5458d9cbe534b9bafdcea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1fe5f4d800b23a9c4fea07fc8017e0acc3d37bd45035f48a2fcddefa206f66c1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "b17b5904947449006c2455b675a3da85cf294be1d03006342ce2f0ea8d99199a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"79f5206bc620abc894bd53c2bb732aa90f6ef7df57edc0f4b5f7687baf7835a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c77e9870d85d8d4f8709246e66f36f6b18bd962d976c6feef5caec1f1ae6740b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "db9a2c00122cd18c8db12f1f35f4f087421757efa695197caf782b06f9ed7c52"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4f27ea4235d352e3ca4d6abae57b89072356b362a5aff5ad24804898ae45ec0b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "21fce85bf18719f3212d3d1c3cfc815b78aff385c85b36d90db6c6bca2b877c9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "aa39cf0ce7d69baee094715b155d983f1cee5f2227f6420efad249a66950b732"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4bc2d3e8949595624d7f6f484160c5121433ea2e0f20684691e6b8d596b600af"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "eee3f680b598e4856ab2dd273636543f514bffd28116431890f5244530e0b680"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "5d77929f3659d6fa7ee9f65346fac6340b3b44dd0431800ff305ceab1ce77b78"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "16f705e299d162d0ff12dd21180e173d50185bdadc9c1b18f4e8311b084f5ed9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 
2, 16, 0, 3, true, false, false, false, false, "2b62e5a8261542eeb06af1475a914e670c5dbd62b481e3fb71283a476702c327"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ede77ae1bf1ee5a1df65fe6bbdd53d37e730716e102554b4ca902f0616ce6fce"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1b2baa6ac764b32a0a213754fa502d7c609eed94cdd79bf6f3d7a5c9fb67ded3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8e43a1668a79a023bcdc24c79a17a5668cd0851c58c63521a1907ea6b4b08e27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1171d9c7e20c1f9bfe268b9c697a065e4e0dfb131d38ccc0bbe77f3e7e8a489e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "1338ed9ddc2ae62c9b8e5af582fc635e8aac6cddd9ebe2e564ca6ef0db8a7aef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "818be6e569751426dbc2e2143d8a73a096c0bc4302a12720f03ed0ba005d529c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1316928dff4f0027f8db5387873b2a6d7be54c1a40be9884494b1b6d7d116bbf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8bebdbe303ad6e43f70bba7c66bd13fcb2bab60124fd12259a27bd04837b6808"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a75eb4a18dd69bd63f70e7b067d11650a9900d57fbbd8d78d3f5fddb68d762d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, 
true, false, false, false, false, "dea350a87a2a8f247692d250d934532a1d09dd1750620a34336738df89f1815b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e21f898b5654a6710ce6fb693a5bde47255f625ffc4ba385b2f2ae47ef5a0ffd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f94a8f3ac31a3b8315bebebd9a01daa04efa58dd8fb999a0135f9b08690d0658"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "c820ab53e328d9a1bead97ea0d62a83d83df22bf34a3152251ad543ea826ae5b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "753b73114972e4b96166cd46e2410489baf6b1fe1bc796f192154298f92d2646"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "f4f1ff9f51cd2b6559c50356e212a1b002d54f8e3fafaf5ca358a2b7ec1fb584"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7ae7d0ce89b57a04add9ab5e5a25ea2e204916d8deac25e6bf69db717274d380"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "127bf0a765369b103d3fbda374c94500777984204c19d3237adc5e7024179391"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "382d98836bf06b9d8749fcdd95814f241c9ea600e95ba17c403fc42a735c69e1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f1edf7a596808f28a8f0135e7bd0631c297c4453549eda2546d02a6f2ee70466"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "19e1e67e6b75c7fe81e181478b3bacc72775ae3a7a6aade3cafccb2e549f9e4c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3024d274f1d34997cdec83caeb3a67a9d7b7b409c1056f5a4fdb6888c0b353b4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "277431b07e264b08c91cc49c03a559309a30d448059da14e0cfeaf06a470b4e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 
0, 3, 64, 0, 0, true, false, false, false, false, "d2c7326be4e9c14707d25f294d0b761fc5ec63acbaadff53459bad8b6d4c63e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e9f596f45360085bb7b5daa8263fd00f1eea30b573b9198ef67ff01733757d10"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2b6500b62cbeee9ed899efd411c676786fbdac9e1ec2d097ae4444e4bbbaa455"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9645ee469dadbc6266ce66afb3ed50a098f691fb6ff766c0f39f49422b59597f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "05b50f88e583146eaaeccfce79bc03b7bebc704311946c5eb5ba7737cec655e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "759cfc74d3ecdc97bca6e27999c4c9df424322c6673cbaacd01eeb45f2f58759"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "60bf85feb725e52ac1325eb35071d48a1a5c86a5656070716c2cd9511c4ed869"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "b9a75b7db734128a4260bb2cd017b787a08ce3e339d6a9612401549231842586"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3e79b1e7f0e7dddad01116c5dc0009ae2d4f5aa87b5c9e8c5a1015b5b3c9bb51"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f52d367b59b9bfd0723341f01bfb540c03c7b4dc5d494b72cbb1d0c969f66462"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "6864110c9e43761861b7c4f28eab883b45e7d6450fbd407c953eca0a92fa6793"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "683e09776e1bf6c820c7db004dc833ddf1d3835247ce11ad1920b82a4978b66d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88e8fd790e18f77755ce51f1ce59d8d7f63a9f07814bae85fbefd4bd34e95546"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, 
false, false, false, false, "1efbe2d7635606b844eaab08ba19e61190fa3271c0ed77e2d2787841cdc52dbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "dbc8e45d8e7d9974f6696f1afb77a043af4cd134e0ad5682ec89e6bb47abaa33"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e49aaecf723700c1b15d9d679fe9bceac927805691635922f4ea407b209b6ad9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "47af3254e4ddf6db7f42694c8bbbffed6998659e87527460f9646c714d4bd9ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a0fd296f7ea7e743d8b090371dbbe9cf28a00614bd6d0d1607b4ecb80cd3148a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a5bf0563631360660e370f06dfcab8b7b33479863ff76e8350af3feca99f3287"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "697f926069c27adc669b429d0024a0e779a81de4f79aefc7b50a780a9dd2d144"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "f0cdb4c71b6a1c0d9b84fdd419d7df3f51ec997f1d7428237d4a43a6c1847826"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c81405bda5832155d838db2200b1dca3125048f85969e849d9711caeee3a1c62"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c43b7523b32d781ccc40c7f43496a11dcdc200ca506017635ffa95ce440b4e0a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "175c2bc4c52519a19f97a04fd549a15d72dd498e1974f2bc0b6a733019331b2e"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "cfc315b2dce06e4530fe33cb03e4243cd4721118139a247d0000a58e8e9bebf9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "0660c14b7eb4839b113628e6156477845c783b87b15ce5d27e4ff878a4f2ebf0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "d1eb6faa2a4558d582cf980c62b424164c85cb9408fc4a6efaf95fb913016714"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "a05a44375fe8a29fc5d9dbccb7677ba9fbe0efe5531351d10e049ab5c7ebc2fe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "afa9367c3268b444c4fd2ef89d18202027d8c6a26d76e7cc54cbd5e6cff035f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e8a801376716fcf6aea3383403397a26e40006201d1f24bfb958ebcb89662b15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "2d178922a19dc479a4bb91aac064aa7b565fb9efc93f038a35491ac7a29d0da2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "094dac4d344f93dd59572d147199e12cd7cb72889cc85eaf6563d8db3ba60f3b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "4ba1e3b60cbeb39611c0691910db32a65018e2bf13060dcbeff223e955b195b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 
211560, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "46a0a5da461607999bd030a4cc6a019b8fe5e59f8edd21c7a066c1d556cb21b6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5969c0b9e6aedaa2f5adc9883b4fd58e543f315b48ebbc8f379d41b86b1b8252"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "6bde444a74abe45a64668e16d9fdaad5212098a78b31ee70719c1382a10f5c2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, 
"6720be0df89e1fbcd637593a1cffbc8eecc935e6b958719d9b5309f3dc0420ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "692ae2c0532556c70705af9583ec3a88d11415de8606580e8751d261c56d0963"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "77a609e53aed84cc1c01fb1037160e2261f4b1f4b251b24c2a1139778d9ce9ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "8e61e9763737db6abb88eb1ddc9aff5393637d0cf2b184af65d8154545c7bd74"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 
128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "04434f7f849f293e609d15f8efe56e9daf2811661ddc7b2234fb3b69d44bb6a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a6279bd2cca629e25eb975082b9317c4f48b38238ea57a73186bddb709ca8452"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a02655dab8bfbdca6565bf0d84b793bafd23cc2582f764c3fe57d4e95a685479"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "085631508091f8b5d0527af168a369bd23715ed514e9953f2cdb72363786bd2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cf335ebc091ea120891c1f3e08d6ecb003c87356e608eada590678ac561df3f1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0e18203e03c38b429b9189cf2c6076469b40c74d2675f0ab7cedbbb78c905313"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a0ae3256b54ce9b7c60136d0beaa1ee0ccef3905cb388db2ed9e28f7a17cd22d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "8374df8cb128bc5d010e0e43a70af22492dc329b10d62b9c1c137e3f0821236f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f7cf73f659983af72dbea61eefbd040ef153bf9bb0679a3d1ac33518e0ce2917"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 
384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "70bb2f57cd2efd3338fbbdb17610f1a7dee066dca38571bf604564962c066f17"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "44002984b3487826e2ba35aa91ae025e3c3299facc047985dacf4fa1f0ccc919"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "4c067c4e9ad758dc51a10b90c7c210e0d06a31f8b516a56dede9034d1e354ffe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "b95054343963c703ed6055f16d9f3c39c015fb68f9f5ca57fb55d2c5f2eabb67"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "89b11cfff7cb6cf109caf757925b0d9d1488090d239c1b9ea610ae6a298bc0a3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "443db7c4772cef999bd0b34e97b9f9cb7dbd10f64b2c40b0b4af1e9ca7dc5e76"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3e0edfad24226ed4a1b876b4a49e14a996769e5e63804943c4048736b40a17d9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3a0f6a219ba812c0b33338e7abd247154e3c2e16657aabb6ea247414e54cbbc0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "35643782a477611634024766061f1c34b481886d12b872af502c4df11447fc6e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "27e723edeb2c5b3d2ca782944720346e42766b36a3856f6594d79ffb606c676c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 
3, 64, 0, 3, true, false, true, false, false, "9987096f28383af2b6058994ce2d5058d8bb1e11b2f7bc2ef4434e70ce9d35d8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "953419175ddc9e5aa8af6cc18e69519d229cf59c9cdcebd99cb6d7ec5d2742c9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fb48b5adbb371b7ef0790a3b15be815888a59f0f10a68db0bfae871967201f9e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "cb47cb4763daa7e309c1fbc3f5b58191408b61519b6e3d4010205b2af2df0910"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "0a659ad765284cbbb7e1026308446677e815959974c2eaf610d22c5838d8c139"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "d09e3a1fc09e2ff81f252bfe543ddd50c6993022d0793f80a6d5b7bd2da9c0ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "50d737d582663a4cd060396ada173ff1a09350f94b7ede5ebcd960b89b65128e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "427f3a74026e9f3efb9c99267bf3de0b9d740a9e7016732b4bb7a0607f5e9b87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8601bd3db445967017b396176b4764140b8fcbce6ac3a721e3ff869b96bc5971"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "eaa544b3b66665cd162d399d11af7e3c89b5043520c8e523457396437263c456"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c81069e7f9e8845c4e8fa34dc5a3a8f4db8a25d05804d44ae71023a9aeb0394f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "beca22c309088755c664ee916182cbc0397492b679f2e39749c880e2ea2df844"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6b50bfa23a2d505c51a260adfcb46ffc8022aa13c6dcab115a7ed8e6be85c96d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 
2, 16, 0, 0, true, false, false, false, false, "24c9fa8d9ef2235b6f76acf8e2a6d5cd7d13c4e4156c69b5ddc1aef45eefe42f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "d9a4e9d8859139d4534c165cf2b54f10fb33b0040a03acfd02d9bd7ca964a3e3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "e4401d53907090bf26cecd6dd43e4e086e15472eae12e413e54626f8b8543a72"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "324a6d1378151d904d5c536dec64aff45cc6e7364449456ed3acc4bc9957423d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "c00cd78883d17b8f6955ef4a085448205434a1cee0a57bee5848e64849c18d8c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "316dae9bd4e97be1db9262a55848b4a7be51d75acea138a57654bddab6ee50bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "15ad1d9099d2639e30058ef0547c5264d9740c72d457be3fe61c01ddbc466143"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4ca9e0e82f01db5904d4a4f42bb97e113b9f6f3e25fcc93670f4a235f05d8bc3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "941e50594eced172457b81b0558375f216ca8ed52e38d1d01a0277e8dd33f862"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "83dd62e7b139c646284e1f02822716ea668a652abfcf481520267270908038a6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, 
"99a07de3a573c0d13fa1c988831e89920cce98568dd3f8a9dda1cd44f531e28f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "aeba8d8e70d7a5831af536e1ba6079403986e3604c75832491687fb246f4c91e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "be210cb25bdbd46ce99555d444a4db88303ec8c388897287d68fd14813d035b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "ce52894d104591b83dc04e7dae747a4e241fb8312c67a1b1b21c55fe460e2caf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 
256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "ef2c185f82183d1656be21e4ca66708a8915967f64d16941d5a8918b92d56c13"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e43fe55166faa22face8ecf770c824229cdf76019eb32a8fdfddf6d66eea87ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "bb51be28d839c8f6886609c115114342dbeb3b4022cd540332c459267aa7905e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "2738c1a8080fd9121c065a23921389020a7b416b392e6b85b7f37e8f22e9dd01"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "d37dd1bcc3c7b6bee7105c4ffb0bdfa1acfebdf9435f075bb439089d1e7d44f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "6beecb7b05df3647530e987a7fd28ba646face9a9e1481098fb2d628e384250a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 
2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "fc41aedca2b234797e537a0df1b8b5f787002299c2b23e6fd3fd28c5be4ab204"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a0e3490a9d0a6bd551f8fa6ef1c3ffcfd3c769b59bb91167ad13faf0bc3d3688"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c0f47210035aaf2f9f8d0ce3d3d4e6d5feb3d8ce6ea272dd03d7b04ae28b764b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "c21cf7e0a34bfa5099583eea10b6f8ab59e965d81e9d99c7940871acb97d27c4"}, -{ DATA_TYPE_BF16, 
DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "23fc31e8db085ebf0431d001f1f4035a3ce983ca1b71ef262e71d71b26f8968b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a5d3b697e393f8c3d2077db6067946130a6abd8b538bdcdee65ce562b0f93242"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "c5ae1676d4a69ca94943abfef90fdbe1d703c690501367eac4f96f10ff326bd1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d4801911f3b74eef1a279c2f1d7985fbe3d327a989844b403eecaa9076345a07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "431e6b546e4fe2b0735547ff7f3faf8c04777001cdf75c87a609f7c72f2048f8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4ee79ea8129306f5c5429704cea0753c3f62bee967f3028d946294fdfbd52e68"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f0fc31fdbcea8bf73b3b011597d3d5d1bd2dce8b0f30b2d4ca52d9cd64d0a5da"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4b79660567c5e062670031e5750830a832984d59159cb6faf16c94e926fa5996"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "835bbddd7cbb73e69d1dc384fa1ad70b5961c6e93caf570cc34bd7a71462cad9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f24583e68b39666888cf5d49923dc7c850c867991171ed327ad04979eb3b2b6c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "560ea49625ab161e9005ddf0075ca2cd863273c7a8dd48a79f0ae0629cae091b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "d6d2a0ee240a229f3f0515ac66489ebf162d03c80bdd2035e8223e28e00910c3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "b0d5a1bbacb2a601b3e5760a158897d41f066aca674d6a68a1a1d1bada1fa3dc"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "ee74dfd3d91aecd13089ec0e15445d6d198fbe54e627189d90adb04a4180fa9d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bc3b77c5fd1b71897efb09b25f4a8a7ae96d12a7b1f9f9fdc2eafe7cff900694"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "40c36f214d8558ac8a6e4a67f87e8ce2dca7fc6f1323fcf9527dc68cc931a358"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "833e8d90357dcbccfaee14a719e76079cb472bede51d01a2ee6b31b29734231c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "5bb472ceeaac3fb05dd8d40fab23a7da83984b8e77dc16c691f0a124bf4d90d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6416bb725afac4353f57214bc2c49edd76636dafb87638f18a3c378097cd7a02"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "0164d6aa8faa8247653cfdf5d0352580f68b3268c111387ccaea2b8e789896b8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd206d12a115ce9666635d9b0193b05a4b4183a641378b2b37ca1f08b40fb872"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "7874b9e358e9df71bc046aac4c514bcbdeb80707dff0d51a779a6508b9c187f6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 
64, 0, 3, 64, 0, 2, true, false, false, false, false, "b0a7a2e61d2f1a7e75a679b96a230fbb6d736bbbb450037601e477da80b5f5d5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a8ec2195ba70f8c4739fe776920dd0289e748b6571f7b972dcaf801e1e5e0ae0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3194491b6b9804c70e2f6af262391325c646b7bc92c48b46e321baf704e05fae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8578d675f92487b9db8de6248f5b4c69106e10d5f52864bb260de44e942c9caf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 
64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "716d5e10145a4044ff8060c5d1fa550d132326314bb741ca7df3c428ace72ddc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "58fd6b496afb9954fef093faf991e193385dc54d16e93dcb3bd95547c35677c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "4dad32e85fbe4fe484d8b8a7e43cf93b26f189170b50165f31a60f0c280eee0f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d281b2f87d360aae0264cc28ac2c783d8c4860af96f0a18d32cdf83be9e5a900"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "59a3bbbfab9493494a7e27dcacbf5301b6992b0e52579ad8a1222f3ff6060955"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "95c2999829134b5924d75da6fb136b7c3adeefdb74e168f177f78e758ec86102"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 
0, 3, 64, 0, 0, true, false, false, false, false, "76e78c6a16aa11eaf1c23d4e0e3d65d8b97ed8a80bd37bdf895fafcc9267c864"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "422e5ba78c04a678a56a6bb25dafb1c6c989948d01a3c20a5ed3ad8e6a646dbb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3273c932519be7b44f1b951f90b1e8ffea147b3bc90ffe6f8efb5f95ef5f5d7b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5b3f56077ad3f52f503d795f716a062e9f3ca0fb7fc5cac1fdf871be7b4803db"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bfb2cf43114fbf6ed166d5c4fbc945c34fb42aa1c2fc6aef6aa58ff402287ccb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "92ba3838c0e80d405dcfbbf8cc076a53a5b0245e66db591a872041f90b3486a4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "6da3b8ba30f8346857021a2de778c7c50fbb6fab34bc2924e7e951943891370b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "9eb3b9c1aa4c3e0b0f02a1df92f9717ffe81f50d325e7a07c8d87175c3ae684d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "3fe70b95ca042f4f1ef60682dd21d07838e0b3610d369a70c344934eda8c41e5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "71b19fe60b9dfbf3dfecee6aa3f3cea6a54baba809cc891395b8b79afa59b03a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "e8c0bdc87e9059464093a0406b84347fe096c807a12edd2a99b5bef98e10da05"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "016480abe9738869a550c735102a7ab1a30c309e2fb779650aaddaaefd74a52d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "5dcfdd1ff324b3222793a9f7a9208f19c5f7bc9707fc625536b76b17629f129d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "c9a67f62184382a97288f8c859b560cc23153320352146d185a72efea4f49de1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "c616ba60658581eb371472cc5186e442f48142d071248198dfa632fd62a5eda6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "1511e8e9bb6d33c3d1579f3781e9fcd79daa6531c6fb9e1fd9a136311b6a4a56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "7df38f14dca49f547f01c238344ccca8a98f24c8279df0201a47e44e8192573c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 
228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "854acdcae4b3c0d651f2af71c84eae63be1a44af9bc92fff0e67fc62b17936f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "fc7a8c3171ba1b5c31de122f31738b8f4a50cb8aae63364a838154486c34137e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "dc49e5ac658cf0642e044722309397981f31627ad22368e307848a12fcf60034"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "a0accf802c14cc8b17e88670b64e54a3142c768e6b0a6bdf15eb7dd831cebe03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "bf77e551e83a4e2941765d645a1d3913a53732e2cc0e1e881fdb4d8e702e114b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a6ec500f0a2ccbacd2fef1c396fbc54ba79059a92936efbaf33f9abb4ee0958"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "007932793b1d5b6adf62dd896e52ae320f2b3e0b74c3a1aed67e1265d36f00df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "56b565ce6d1521cb87b7edc587489064a4a03b6f62d38543cdb6600666dbdb32"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "dc26fb3f030ce3552eb2af74d15866cc2893699d4344c1ccd143ac066b0620e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8087818a17c47131ffbea2123ae5a8fc14053aa34cf788ab3c6f24d70d85fef0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "78f537bcf7fb0d374ced486ceacceeb399bf2181f0df2b7a196e0aaff4f35501"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, 
"7f78e318061c8b783177e9d23a9cfb9a0ab84fc8751f6ccc4625510cc08ae9be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "fb6c64d85e946c3f0142bcc7a977f2ca8cba54ad72d63bc56d75f1577948465f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "455969b9b20c44bcdfc601aa32d4aa2bad6c65d6004470edd2bc6889568fcec3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7af88e1c5982a79e1477bb14e3ac55b858dfc7f1962355b3402a5b7632843632"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "64255add7c035e541b415f13cc072b23b14531fad967914fd602d68e03da460d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a3207443c61baaef52406cf85ed3c9896dea3e3212cd625511e8bed6d3d92f55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "4349ace32a6e99f7638442c2cbc642aa740b7bc4599876e1cef74ba7c356ae8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6c456c541b9b8bc0231c509614123b447aef76eab60e2933ca63498225ae0751"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d25d3e0cbc6d9e9a323085b5d238b095bac5bb2f390a1b912a8ceb49de30c330"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e458b191d6e2496553334ebe43cdacde01dc95d8f1f13478c208901bcaa0834e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "c6b797c19ad92439cd1a4c8d3d844cd389b41303874fb5e96de5a6b46cb2e568"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6b35a6d7c866a483b99a66baec5684b00941987cb49af49e1d613e909364988d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "8538f5c03fce4117c09cc4bd323a1c9361e9ab21bfee7e844ae3c9447fada728"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5f015198fce4731158127fa641cd72bf2bad7674d3d65e5d568b523c0058ee38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "c72e0383ce3d43a7b66e6577ad212e7f7c97f6b66b48478bf26739d2fbf0ea4d"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "812d660c498a50e75c70c0c9c8b1419488e48c6c3ba096fe38555e64e8a521e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "96d0b942580095826eee683f6c9a184f8ae9e67e9257e4901f878e71162c2a13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "47ee2efcc7b18b7b5bb0216b930193a17154ef6c33d147c6fb64121f6b4bcc87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d22f8354830fdfd4d8390727efeafdfda47eedc9a2d5640c1011c21b03b420c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "9e392161f711083904f2d9514a9cf1f087e68bfc2e529f8998f90134c14b7a5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "111912d97906e2dce4011f070cb2b39601a5286ca0e6026235c6fd3f40215036"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "ab6a1f2ff9b3f432b39c626485898e8c27a1ee771ad056a300083e8031e887d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "18bb5068821c9544a80929955cd0102868fe289141db6bdbfa863a47170f3f62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "45d867f7b6e871591ce12d8cd431aeca024a519a5115213c3659a79331da508d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a6ceaf8f2a99127cd846458c4043ccf000c2cde7d45f60561a59c1634b447b4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "efc3b5b4024c58face414ebcfd0c3343a9f13de08f2e0b483802cd397570d45d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "61b4bd11da89b1cff460cbcf3ffdf125da09738e1d2b8635dbba02a981a80b1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "4ea1f5726ada8fe473cd9eb05f5f5df9fc277d28a5a24aad313ccb9fa2d52f92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 
512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "55e6175e1b2860225521062d765269c3baf7a1412c60ea4ec8c1d7fc3e505e98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b78a3717dea99d90fedc6a8abc77d0ceba99d769b338260f26c1807195e6abee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2b44d6f3e363ca9d45fae830422b3969e08aaa9b43c97b8c97b1b937b61dfa81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3d9a5a48c67e886580fd58b49322696e37196808e635ea1b4e134a8b4a202905"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4cd1831f36f4f8d7cbfd3d0274f26f40fb837c161f06aa200f177aa2c43b96d0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6932515c43a8790792d9bb5eed2766278e7aff34d43afada16188db87f8cc7a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3a1ad9753348966dfdf7c7d6290e90175728df1dc0e8c9b743ff015b5a55402e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "772169906011fdfcd7fac9fb7131d928f6432c1b0d424954f1e2b97c8de8ae83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3e53ba3cb4bb5604d1b98d4947a96f060d04a8fbedbaaacdbfde5eb482cb6826"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "7399e8a7e2de6eae182e447fdab8cc4302727c1b3f0269abca908704cb7662da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "31623824824e3ae5b20ed82f53b265c6aca6c46ddc13ef7d14b6ae20eb661c1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, 
"ee18197560978c80149bf0fafae4c6b9b3a3d9b1672aca8b2cac09c49b647834"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a0221752df029fb863f506bea5a959673b39328cd0499dfffa1069474c97eb99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6009a1e61fca6de3406b3e97f6980df5406321237046866aaae65009d9729788"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "a5bdaba3eeb9bb097a8974d8ee9ca2d6cdba815ac9733ecf5eccf192bda2bd0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "f2005b3159b4a341d4d7d8a955621e8d4fad4781f41af7279dd93259c89cd409"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "68510b547744e583cd84763d6cf2882365c404eec336aa14b7fb18feba2b0316"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fe6f16cccf27935fd265df62e0031b33fd380e8a38f1fad136463b96bb50d01c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 
0, 0, true, false, false, false, false, "e30f3023602e9ca3650ac9a85927ba020727e15317c5d74488499ec6f02f739d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "da42cf751eb12d3204c9a6161987796c4fa3893cd97a6d1c365258c825775fb0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "7a07a4ce96cda87b26132b8169eee7ea4dcced32968780fefd660e8a34e65698"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d6e894bcaaabe54a7c610f5492237de1146378399dd338ccacdfcfc34d63107e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "3c1e246b8932e7f9f0e728abf2b6781e5891ed342f47ee1f3dba96b1ebefda40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a0ed3e278c988fcc1c9fc204fd36fb6cdf7953bf6da7929b5227b2f68b722d28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "701ed326b5b6ac81915ffd8ae4ecabae1bc0909a80d1ec952a67dbd4ada5a893"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "841b083a41b53b7e3cc1d61d3f0c7f6435be5afdaee6badc94a5de57478e6ae4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c7088e5d807d94e00617662b6a12a2bd3dcf64ce808afa53e846a6c1b26009cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "63408df45ac7b12d6e34f4f293f5122c4afbf951f51f65a3c8a9c42bd58607d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, 
"2fd2d3d8768f537428ae2edc614b1612656b0b3b58200cd05d2cd19f1bfdcc5c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "4fa2a200270964ac5066f9deed0c8361ddadc8fb6d7beb677b160dc0f2108356"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "98429fc9e856ef519cba48be026b823c4c55c0a92f2606cb26cca330ed88514d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "b1d788a0765f8f2ca5ff2b1c128498d1bfaced26ab793add44c45e1b2c7b357a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "4a5a271ac2b8cf1b8c3cf82eaa874a320fc263f814d8f808fc70867fa176bd24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "646a65e01dbd488ba3c63ebdf52c2ffd48d4626af2bf111bdaabd93207d4df81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "b24dc21ad571651b0ee8a830a2deab08faf11af80030df456b50f2fa9dcf1efd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "60405e8ab9d66c3d16cc4cf3719f483e5467a34fe9b4034c74bdf12b9461528d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c5ba3db03f0c1f723ea0258337c8ec615cd2099f0cda2a3aa54fa6c3d991e382"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "feb87a6e308701407c51ea464f4fe1f076d564c546812051a59db8788523aba7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "de6eb29be48cbf3a69eb0a49259fe668225922115daf1f69c5d6fe28eabedb89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"92ffbefd8fd6b0287aae78550957f0d688e1c6baa8153b90652b14f65f545217"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4f7d2c6f723baf7a9b0f68e6caa2e7b32d0ec8e6831d1c02086a160a95e5d89f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0bbf50d17fb58bacf98d0d9977822f8603e781eeb1853202b87dc91522f15801"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5b79e2bc01960faf161d9c8a8bdf748b4d1cd2d2631d1ee08fd7940da319902d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 
153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c56f2bfe48a95d06dce3b494f2f611e566bd353187fdaf54c24b5915150c7682"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ec4a7734ee234165ad0de1feef98732d9d09d05f04b4e725ebfd7dd312ad446"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "19c3be57e55dd359452c0967a28b276033bf1c319e39bb1420fde01ea73ce31a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e9972ef39342938429a996868fc2f3e17bf3e9fc07f63ecef12a77ba8d333de4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "37866d288361cb8e3704097dd05d9e1eb792de3ea223d4e1b1afcb41959721a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "9d4c76d8fd6c425d611c2cb27e703ef8b5edf0c1d0582beee79879430827593f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c55973ac5d52eff1f8b559f7bfc5b8412d32f5bad696f9e3dd7dd732e4852de1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d3c3f88903c271c6600aab7800d42003b57d73f60b37c01b6484abbb7ff6a313"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "2dad2e5333939d5136e39703e6acba68be5e3ed02e54335901ec56effc78f257"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3651bf6f4dcad65f8be618f78218b6f74c2df66c27c8befcc5be7cfeef42d989"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "072bbdcd4b5c31b754d125d202edb8249044e91830c1498ff7383e8886bbcfe7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 
2, 8, 0, 3, true, false, false, false, false, "0f403f443af2431b5848f28c5d2be3808a8dc368ae6520a10e5dc0b44b07cdcb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3182396f02cd2ffc4cceaa0a70c07f997a74d67b6b3328eac4fa78ef7b26de59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d54cf13223b10dc3a7edfa06668cdb0b8be76e56785dacfbf75a4d962122d9bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "ee462c57b76ad7f52b4235887802326a680e566fa73e114bd47d9d0090882e00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fcd37f7c6236fc302cf2fd16b10144829f80d1fd8718a027d36e361afb5a3ff2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b99a1087e1b7c058d343bf235c8f6fe2b853e1dd41c2b2744eb41755aea2c119"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "b197b11f37a3311594c5dc1e5729d975be2854bca587435fe9148f83c8df4616"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "dddb3a0b8a608827d7c2472ae617a7d58b9965c0d5c0eb6ac2201ab0c1d0ee58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "20bd6cb0deec331c81212e9ab7474c075db10b72766b4a0399b4d57815409765"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bae2ce0bceebfcb3bbea5117f73709e9feef3dbdceb71c85629d5c523bb62e42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"7db9e58c38b448865c730e5740248895d71f55c84bad845b7e6678ba9cf1d8f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9039a7982dcc7a9d1035211902da0904b061ebdc2e8cee4877e6b351fa051837"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5f175b3701276dde1af21909d3b3c6166a20622728804f2de8de55b41e70cab8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f3cd36224d70cc26934b22a7e6e78461abacd9f0604a453a4e7ec697f192ad10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "05cbe1ae8d7c500cd2d87bbe54be7d9c747843d2900d2867d0f0405a9f62d891"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "dcb04b7e71196b22be2963c484fa141d580bc7a0ea52ab304f73ea3902f4c348"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "2f8d7c7a60c02a06281a46dff2e5d073f693df866759cec1e635c3cd719455ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "fdc7f2f4bf8d4ef946ac400266e340a99e8c101a98ce31880f377f674ac12752"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "55c94253060442ed04a0d3fdcfada3cd8cd808e44e0e780024bfc48559ee3cf9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "14c180e5a022fade0a58958b275d732d02f093c54e9f97f912571280a9c8b01c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, 
false, false, "b2f81b5b42ef46715a122a82d84de6a4cf92c2aa7443ebfa3f4cab2ed54327c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "47ce44a8669e2a7ca9fc31a48ed87ae2f2699f8fd96314e681a32cfd25fe217f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a8312c1b7676bcfb5ef1d774c911ebcca51d64deb0937e476f83b4bd9f895531"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0b491f09f132dfc6ae53a8e81c7461865bf36b8c8ecf0fb2120091fa2c56ce96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "aad7fa5dcc449359d4dd079552f7e4d667d3d96ccdc8fb694f0a60cc7cd1feac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b583791ccff1f249c0c83ba32e92fb67dccc9fadb41463eab4d3b5e8c17b41f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef6e859d3f80c08513b173b204720132155593abd2a8ab81dddf02b3619a38c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "fb20f53bf763202c1d485f1488c79943d9c69f2326067cc07d6ec9dc6220a4a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6e31806cad5e7f5604a648fd357e99c76fffc59813817f95fe5a710c7fd4e8ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "bae0aa5a1142320fb5fca5d8f452bddd7b635e2f87baba589bfa1f3367024a56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "12a4c29dfe7c01581a6e587ddf3ec3ce965a3cd850b89094adcba2ee155afb96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a0b18282a1ca6fb739d676626fbed0f8bb55bf6cc2ebbfe93c94db132e9c8af8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6def0d50f19b2f1105be0081f05e2cb33d93c7244b097b30f4b06cbb61146e67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "03dcab2d211d2ee86007fd736ed51522dbeeef49755530f4ee1d27965b2326b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8a0b2b167104390c23539d847230c4779b40c7f65eec4408821bfafa0b8e6781"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "59a84caa53ff28c355a3bd5a4b18dfdf5b521b202a5e2de6829c1ad550a46a9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "936da7d57b46582d0244f74f67d493c8fb55afc73b6f4f71bbebd033df9e6d22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, 
"5f93628e45e30518847b5c3b3b90ecdbac9f89e5a627503c3ef4fd3084fca2e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "64728e41e867677310442222921b10eaa6d7ea69164c015ac2c7521a26cd6b35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "c54e3efd1d9bea58090a4399a8d49141ed6aedb0a5db1f4e2b908397a6a83369"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "b8eb35b65821d9e7d27167a013ee98cb19e41b92cee3bc460d6b26fac367b3ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "92a3daa56ee64e471411bb8e7d5ebcc95ef37610f2656f3fa79787e48753d904"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "6008ff5e6ff90cbadeb6c6ad8a58cff42c365238bc57b0eb4f5e34678f4e739e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ec666a46dd0e85a7141d4bceef7cb7d49a2723f69964a5387b558c16a09d3b66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "efe1682bda26700781b470a943541d6b0a8dc816d79d283342f75a51049b437e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b0238c325551e8542046878e3849d7cc55a5e2928198a8e4182aba8a733e3214"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "db91d843cca016c7b3a6ce7848d23306cf74cbe84705b0efe5b4cfec0c1c21ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, 
false, false, false, false, "1fad8affe33e80c65713543eca1c1a0526820be24368c4256b85150b49cb2b73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "93c6f93e349bad2f62bdb1d8b7e269fd2966bbddb5c806380f62b8c1639f9caa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "5f9d48e5f6bdecdd997fca5c40e90fa61088fcaa3173a8e245ce33144de5a53e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "e15de83190e190c2766ae241e223d202f4320bb31824b8edffb8ba464626a8ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4be24517715ec7c5bf9c06fbde66fea20d023c5adc4ccea86df004f37c3edc05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "44481d2273f7a6b7a5bd8161c91cfd1c47cdd077a22069f5a64d711f872a2544"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "30f787700cee3cf764a6071c6c5a3799a8eec3077217bf2e1a6b090e8c497a4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ff11ae41247c4f9c8c9665987cbd9263b380f59527dbfda1b67d6b42524d4278"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "ab7e6cd8ad4d6e651f4a0d7556b4c45f09aaa56dff09d33556170ffd1161cc1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "59b4a1eb4d802abb700c74380d2544f7eb46eb6a36544575e96870f5b839df99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, 
"661c2fee03953f4c4d900c9b43309822434314d748379cf814783b3c60f2068e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "d5073f4dd622f572c9b56ad566b3f140e413a7e58c2a9fc096aca78d28356e43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "ece8266f96e977a4f20ddf79812632923d636e35be487d30505ed4f034b578dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "1eb4a120b40368b28d68fdf3a543ed7221712060676c0338827b0e68bc892641"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "7b4e2487c6010afffb8c44ab1afa4a5bb81c1df4256dbb85495189abdebe5134"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "9ed9768ed56d860580a465862dd3cafbe92ab6847a5b2e1f668f9bab0d8c9dac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "eb5e0438004001f2a3e4066792db4c9a4adfaf5909609ce805fb37c4d0021771"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "912766fb468111f7842eb946d77a01c01111265d82f18e27fdf4b855806c60b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "d4a76299007536dab31c9ab01ed19967b3db1fa0e9602d54c511caf9f0bc0678"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "bd94ff92cf0522dfa390314491dce1f1c01db57a5c10ff285846af0aad722f41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "b7a938d51c6b403552d97a5b08fdb9b7affd4cef853aa5bd6f24362d104cd956"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9d918982cb886de9b9cc4d3e9c4f4352dc55e4ccc4dd6833ac6696c03f0b32d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "6bc1070a81603091b5916a87439a973bbda1cc76d09b3df786a226ec4a031b8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "9b34d0584bfc981e893a41efe1aa5bd33bacdc154db5f731c5a81094ee46f994"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 
16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "63b66a4a9e8c9e1a6074eb8860ab13fd6fd4380cdbf9199f766fb9416fe9fe17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a6bafee8bd7ee6cd9daf29b9d4ab5c4075a210a65ec15c44f2d251634f563128"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d9da6bf31c545ebcbdf0983b93a4100f8eebe2e59a114a3c40d92930e25ba750"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2617509ae870202ae456a1894923656aeb64694c2f9364de08d43d6e17b8c0e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4709737f0738e7c74836343cfe7e7b9287dc9778584f5198455e46144d6f8d47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a3365f584931226d4c5153d70da6755d25539d54358e702fe19c261c72362394"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, 
false, false, false, false, "3361232b68a9cda5736680e4e726b266c3889991a0c9739d7b599cfa17e7fb79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5b3aeb985905e62e3ba766a5a7d9a477d73d93b998542f65741cd8b235ea9f4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "9b3d7a8492a2e7e014a4d130b5e43ee2db3b9c5efc7832273ccd78f287d39da9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "72dc4257b716c7b2442cc0ea517ea645556f75bfad5653117682075b35226b52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "aa2d56e5c251427c32edcab60e0cec579ce08f98bddc081b88e89dc75e492008"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "5b2d760bd2590e362fc3c37809afc2dd81b4f86c5478a4279cb9f8cf837da215"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a0f129b3adeea972033bb604cc82b4402e5a224af5fbda3618a71b93ee992837"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7ec8328efd23f0ba6ae8a89e482c42dce09ff40fbeaeb3a9d6db7adf6dc07bd6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "592be1c539b3d221f82f474f44f00a4f37580774380c06a4c719c835e5198032"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b210fff426f228451b1d7389ded154239a406d63fcce98bf5eb5ed39662a4e11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d7cba359499943a46967eb6c15c852a0d94c6a32b53d3d27245e8e5f2b5de464"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8b8c4053d16abcdc55a005659b5a0ba86f14165b8562da5eed72b2b9e3ac3f23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "bbf7f0c667d57f6e79697452f14a2157b08d0b62bd75c8043abaeb3da4bcf499"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "0e676334c37660fbcc6b6c3271b98b95b41ad89fb8909f88b710ccf012a55749"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "58c57161c1e22e90b9a3240511de435f2230da5f0ab7aa44d9bd3c324c161b2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "7494a0700ac954543cd4afc779d6193cde92ed0b866b62640200eb77733db81c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "9511cdcd851eee460bc17467b583d3116b81e2881da36a6defab97e2b72823fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "51588117a0b3628acbd487f5db2f5e44e925af1061992683221d0ad5a0ac884a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "fbdae4fb19954afa3ca9843b35159f7ee766a53281cd2bb611da75fb903e2f02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88a18a30adc4f899b1c0f4b5e399555e6875279cd42f2877042fd4273359614c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f4ab38ded10e1d9fb0f86a56c40d84cacc95bb9ac7db8f52f9e111ccb4346da4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a0e2c6fb0be32ff4649d698dfdbef740680cef69fb833151e2ef9ff552b2731b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "970156cb6048e24393bb669a5b40988a26a49e32339d865f77cae02c10fbc9b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "287b29a164fdb6474ab047967fc52426ac56f78836e60493fc456dd5fb591b0c"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "052dee6bb49f77dfdf3bc69d755831165c6dc0863071e29de916f451c7257567"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "730418aac876dd0ff12772987d59560129c680fff5f320b049f597cd67c41b1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "0f4edc5c8017355d2f2346598f7c2f15a77ae274c2bd81137c385e4d81ddfee6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "925d7198bdbdb012a064b2e81550969ca5ad2a2b09090ce5f2f025079517a22d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "b54fe8483f8a3e1e3b385568aa09459890f63965df8a71072a51f462217a3348"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf15b47000b32a5cfd77a1d975032d15c41bcbec3258736fbe5b6b5636f60cfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, 
false, false, false, "76158e329a8a3749f7a2593178f11137e3ef5c3495f0ef3d7b8ed852708be7a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1963f2f427a216c4ac93fee76ffe1961455f95b8c848534d8982c20e09a6ff48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3ddbc76957fabc50e9076fcd609c6bb85242f9a886cfaaa7d1fdb69e4f815a1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "44e7c92b4fa816e04fe1bd98a17234e2a282c4dec3a02911d0e7aea2758f0fea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "7a430d90226b64b9fdb15d7e657ab791d877e6eb4ba2f555bfa85402bf7e1710"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "312c6692c09314f1447db45ff20b58ecc3855b8260f2b3706e94792bc30a59a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "01526b8d1f158495d9079189bba0d8fa76f5ae041647a6ea281bd8af3d24d724"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "3edef97ddef59e54572448d8a655a531740e6b9ebad357c266b16691a3e86f72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f33113f71eb1cb2c46ac4e5f6f13764e3c34f8526fd1a3274ff75d75d6a9e961"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "98aefaaabaec647784720e2076e4e72d2c0959410c6a92d0593aee7862229973"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, 
"fffd29c013fa455f0816dc371509021bf3dd296547db9208d680c942353fead2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "de12c878d1bfcebbda2943e4d36abc6ea81bd48613a932c4e6599fbe51009342"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "5e77cbdd92a6bb43da73141f8161bd3033affe6df7a0e95d31dc44229646028f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "03ffaac28071f6a603699b8ba53eb6c9ab59d1703865be80eac658b37dd73407"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "86e17cfbeda923c534ab78e9ef7dca994460d5a3da294d7e0ad78c3fd54694ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "c1452e0f66f4d506e4633083647d2d48f724306454a26063c1b7302cd1aebe09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "df96562b66d42dbb0e36b1bfcb3914a22a2787ffb88d2234593f89afbf14bbd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "20ea4938089ca09691eddee6b51ebed79cf29ba5091d76c009a85dbcf79a3482"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "6b285cf23e49b39ada04fdf91b82f15c10f77b253fbc22f8d53e30b576fc08ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "baa65b2802417455247287b51296d5351e9ec21a3b68e5edf37ac838bf5ba5dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"6001f4ca71894e42398a155aaecb681ecbe1471bf1a0038162b7c97eaf9d414f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "395fe57d51eeb05caf73bf656fb59d0a95673edbed9ed08a50f2388d7b282363"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "958ab4386e2baad928671ef1ceb44990aa8d7de736694dd1e437291b4c1cf10f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "245e4923ef49ec8f72923353ebeaf3819cfec14da27697bbeba5b0dcd2096f37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ce05e995bc24ad14e308c9670487a9dbb8f1b36e9ede310c3906559b69cf080"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c9bf063f368d449960f1bf4253296990f7efed663946c0d41b7fca232fcfd993"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ae6feaee5176cafd6d7a4b1ef23a4ab276f62070882977e8092f38d9419ece8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "86357637085f161c4a3bcc4f4ce5c8872bc6a2623cb88fa6857b2452fc27afd4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f956880218fe0cb2f5c0e4dcee228e1ab098eb79e8516a398c24003afdf3ce47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "55a44e3c4ebcc4da151e795adf6e7e6fb508187915695358eda09491cc134149"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "d83e71324a541a5e449428628b08ee9ff3d9c345e535aa47ae4e5010a71ad692"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "6b6f9806a7a9bf29ee23319b1a56b73198bf5479cd51d6588556ccec19a79001"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "64e6972e7d0667e95347b3ec77831ffb8daa3125abf655e83dbc4fbe2307ddc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3d25e6e61be5d532ea624e5cbf338c31c5108e4caa0a353ed9be1cefbd818840"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "288b7fd257f39f4c4003b8e2b725c8e7bf81305c4deef0deaa760962f37a383a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "004380b373ed3b1a6dee2fc0357931d8b95f60a5405051bb4a458c20cf6d2e7c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c9adb20be2409a7887ff0c8ec674e25139cfe69339a2f41f9a8fad0d850e1881"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"7c84acdcaf8b33cc6d080306315e231a4b368f064c19ae927b458e769a7d59e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "5c4dfb23555feaaf890d8a5ce486f9f930b5eb58c1792966ab425f81c1f3529f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fd478480f6c34a66258f8a95db81af9d79f0ccdea447ca0902afbfcc26cc97b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bb6c6538d53872b4def891e073213569713e3dfb2da358cf7d1444b0e68f0943"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "4aff95153f588ba61a100df4bef198ef45d1f1cfe22c7c4ab2088131023b7749"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "886a4659633988a3477763f2e5c14502bef391c3798b56323a8c631ea3de9767"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2b45459ee91427734e02d5fd067d0bb254a4a56ebd3acfbabe2ea82de4d76a14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "2f09d79d869535c44e914a6781a9545d0ca7516aab54844f480099952eec131a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3042c84635f2d21848bd3e5fec4cf827b2b284a1dce52dac220f9d8be2d37161"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "94f046e819cfeef9db61278580a65884a3274929e74665b541a17e02e03df9b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 
16, 0, 0, true, false, false, false, false, "b77490ebf4c4ca77071bcd906ffa5c05391d54f5cadab910b9ac0f5f9c3cee44"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d83db0dd9d384983119e00c41c5f2ae2df9901b85671df74bbc8dcf886e31452"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "83e28248adb0f786f92b8f67eb5f6fb1a911a92173980f3c1858daaaa7d98c77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "3c3e1edd6d20cce4707b6c87f7376b452e8c313195e9bcec032726bc5c235f30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "21f08cf860071fe53dbd398d38b074febd578405558bffc2a6f29b960ed2c449"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "adca635c7318ef4e3585e8b885ddc7e4e6f9aa962f8c33aa7d1b513bef952361"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a519dcf3cd845cb720129d66cfe672b94f9ada37e2bf3b28cf1ccbe64917c2e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c45e98dab8f7e8f76965b00dbd22192d1566d7a2e90c680ea9a711596404f79a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ac842d70c33d2d6c79c520bf9c643b2bc0268b414489e2b64c77e3bce466fe73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "868aaa29a22ceaf3d222cdf8351adb72f52e640ef633a692d0870f36a7b4d970"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, 
"a4562231e55348242029b354c81a7b0b858548dc83416ea37c6b0d496cb92d07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f390f9312f9c8bdc60e1d954543210a10340fe5eb65554b1b53f144f6b05d596"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "5a88a36454e9f29fa85b18daa562b96a87044b0216c8988a3fa343abcf6912bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "e90f36ca2c667ad9d6bfce69e637aa6f107e565fd0e7ba23b8e2f73af3d55e1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "6ecca10d29cca5dc5274db435cd14630ad79a2156ab8b8bd2f70fa9300fae2fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "3728444ca79320d2fde8a22b45dbc3aec9130d89acf09745315a05f56950f2ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "6a04b67d2cd58fe338c0e3f6095ca87a05a6b86a0d7ead7c839e6147273e5938"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c2fbeb8a06534ef03a405f8249586c6ec4eacba55b3592d35123437a8001c4b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4416f70045d3bd3a1fa4316f21ab20f97627d25dc56eaf96557acc47341687ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9a1aa67c502d252a68ef69a60aa5cfea566761cc9554d3b2734c982e8c22df37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ae6ebd884ef0bc81e0bce62f87a99ea6bc9705e707c0c5053a102f6472f900c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "38640276af4c9e4560be2983d162aa511c3463895cab33371c49c0d9f215fcff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4a23c72d69e0098418ed2d76f5d45542c76abadbe9295de152526aab6959c481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f1cc9ece73108be520e1a6e174b60cced74488e0adb6fb309bc3d4b53dac300f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4f9ff7da823ab0b7ab8af298ba399abf20c551f5b3653374888566aec1124c46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 
128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d7e84f805e73c387dc02e8c58d0aba861e00b31505039ea2f75b6e841bd3146b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1bf98b254e090cf12818681a856befca3f4e0acc9901c4308cbd41e400d87108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2e5c3f55a042c8d6088487fb6345ff54bd8114c996491a14e22f1f6da729fde6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, 
"bd5826df2abb809909768ed21c4a9f0caf196226cde9f6265c056bc2154a0af5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9a3131355375a1fd0a15ae3a794ae4b833ae08050b352c26c35a69db472e9f9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "8fec20f3e34bab9254a3cfa3e6825ac63174ae95de8d759a7f549f6b85f8f66b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6276c87c20e74f52ee533cd3de4d155ed667fe8ddab1c3d462bc889a4181b24d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, 
false, false, "218c0cec407d486155763caeed6e01875acd39f56821c98c4d51b6e1a9e324b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6bdc265e871ddea1d5ab4b283750db9fb5106ce200096105ae546bf73653e5a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1259438aaf03b2ea4ef1e5c4e475fed5b5aa26982846de546483fc5de3da05bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "a2f788c3cc32b27fbebdd785539ff442736c2a2daef9de08972cbd0dd5c44809"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ec215e0bf7c1e67ac8363ed865f438ff163d75554d6f58adf78ae90708de8d58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b67cae85eb29a6850276f815466620ddc27af33f418e07c5c1adb2adfcc333d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ce451b9fec6d6e02445b307baa7cc67d3736fa35b7c7ba7e8787455356061957"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c063377bdde178156a6cf9e5ee66d66b4e46b3b38462c69cc5149073311a6b2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "fa3903a739af30386600fe70526d57ea95b888d6ff75be715f63af1ff9f65d63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d2dafd5f13444c3e7fbfaa3a8ef4d3f0cd3d628d5127062ae12b66408c40a3ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "15080abd48811bf59173ff99b05b3063ff98e149efdf86d7bb0d9f52411f07a1"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b42825600d380c4177eafb56432031f9cae7072666ea2bb795d38c133af85f7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6fd16aea5ae20691191dbf6eab61382aea78e4ed6568320c7ec787a5f625ba47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "3e130155bbfb7dfc8a2dd75fdc9053de81861638db57e78ac74479112725bb43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5cf5cc9e554f06264b2293a7188914e936004c8e1fa051675e67a44dae846f52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "486a1482a21a650669bd449449c9b682f04d3abb2c4011ad31fb3adb1479187e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b9b0b5d528859313c1213fec88f44b7f945c38d20b48903f61b0d5ba2db9f331"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5a37a812b319a8e98161d7acda55e1e24f1fee0078ebcd829dac3f1daf99221e"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1e496dec8e830de3d70a83fe0d77a508c9cb4b0d260f2d1f28652769e3adc495"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "803d191f5170528e42b6a7793c4f988d0bb363fa9dc58bbfb74ffb7bec6fcd4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2ae41a4183f9e40af0310f716e51f37b08e5d5b5ab02b210bb940a9856bcaada"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 
0, true, false, false, false, false, "90a22aab4a2e5767212fb5d4d3966258cccfb0b8941780957c0328267434134b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "479423ee65b5f8822cc3c0f4941b8979de51006bdc763f7115c2dbfbd73f4181"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "967fd84df9638528dc91c4d37812b295d10be16f7ec5cfd01a1a6475fddf76d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8e38f84bccee68ae1433a317c1cc87039a9aa177d4319b27dc3b907b6802a25d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c9ba9ede20fdc5b19a5ef23c026e954977f25baff3f471b0d1b6d38ee85aa08f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3838ff0a9f9933f7b9857abdb4ef686f7610219cfa2967749e34f9836ea16139"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "33caaa2f2668af95b1b826e6aef6f059c9bd98e9e098f5dcdc2294f26718ed4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b6eef48a9925feba1b6e5ab83c902e4f476693de4c2e438afdc6e8a1db6847a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "bbe9b4a86ea0e39bb94f9acd20c1a88726d783d6b1cd387764bc7659567d16ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "9c7295ae447c270ae063d0c43ecb96fb642ecb669bfa196fb932d69871566c52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b17fc2014848df9a0019661db917cdb02f0c8090ed381f4a7cd4c07c0a2f078d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1bd384f528dcd572c0bc86a40150cadb633587752abb3eb5187acb2ed62e3f17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7d7f079c618b9c96242f4118f00904e08d58b49545c5bf099a1049befe0c5ffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0f8fc620d6c3a39dabd31e8c1ccfb00fa2fe947d29d9d05089d54a0e4367ffa1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c8d98428db207ad56539079c3c3e9f7f58994c210a3ab1d79913b0d2f553f61a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2b07148e4af01a87dd5512b4f2ab0329ef46a0607ac11dd17757f2b82ad51c15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "71b79bac30b854ea7415372e18ac3343bfd93bf915e1f68ba486c47ff40be56b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6c5068b2d28782c73528de1a597d191f5ad33eb22d1e5e40e5e8651e28c1eb33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, 
"f01b152f42dea23d3cf46a8c6f3817a03414ab308192897aa1fa164e98774261"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "acf852a2a8616045ac160259f2496cccc1be16781aeaa4909fb70ba4ae42fb14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "000182cd205fe7f6c50b1559af24ce5cb354cd1ef8fa5484dbaab5e405a22a68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "cc9ab4dcdd661ecfb9cba408d4906bab570002cb31a64a92489399208f29327d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "d959eabb50aca38d52c6ffcdb8f3a439f85787f45ce7dd9fa35b8f6319ccccbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cc9099bdf318b58a5c3b64f480ebb2936eba10df2796dcbb7ec09d397d0832a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a04c4bc737bdb35e579b97181e5120930696022ab85d9a0a47f055754d623673"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "731850d0fa4fd2b8f164b9ac337ce770b019ccfbb0432c4e52c34561197512a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "344bd10977e88e9e279ab3ebee1742e1bc5999b02cc214ce6570d71d410c935f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "671679bbbf1be35049a8f349e6e11e6a2f936f84411652d15f06545f225350f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "11e5793923f273d607b07a0168047a80862b0a7570e43edf3a444e5b1c341e48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "9598c00cfa636bbb9e8dc38557d29a6d74dd0c744898cdb1932cb73c7a63e264"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "66fe5207667f55b102212281b4e6143870fea3761f4787a6734e2962c1c2c3ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2a906d12b95eb686cdebae60bcd2b6677c9a39b0cc684beba855159e8c89b4b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "24fc9ad5e9589174867c939612ed7e6bfe3dce899d231998c0736ef4c4130be4"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f1959e4c14b3fce94daa2e60663602daa1e3d90d5e5ae0433b9c828540546646"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "79b0da6dc21a0091d9051267df7af9abe66aa0bc6eb3fadfd3679265fd0864ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "56fa4ef94d940f244594ff51d84eb332c37c790cbb24651d8a2082c38dfb698a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, 
"cef4fc28b8e952493a84054023fd59d2bdf9727606b051ee778817b7e6f1def6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "842866f17619cc904f94ce3b00d2b9bf6476bd14a261c360d9ac90089d33103f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7b7fb7bec34a406738dd7d53f02210bc3385be0e0a328fe02365e8b707371546"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "7037c2c3ff393f12ac9ca0b5befa156f06baacf671eb8700918406048302f408"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c08f36b7c89c337c4eccc6515ba1928f90aacef6cb66f72d8ea1326ec54403e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cc0fa0c41b392e60648c1163456e80be483f34942471a19c5781b29545b38787"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "9863bef2f4bfe9f3297c9adfee82acfea6ddf2100ee4380871a4a3f3ac6175fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1a79c7e90e31bf8c8a0690ce1aa418af5e4a414f2f08424afa3318d4a13d4242"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "d46acae22406d8232091e22782c90c344bf4dbf62a1c6fc0167c7c53e5a95e0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "f0548917bd159af25ad89bd83a48d0bd7109e5379cc9c96406a2b7a951bf98d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "eb88a904237897562489eb48dd83f5f8cbf52dbc40c5e8f24ac1929af52d77f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 
0, 1, true, false, false, false, false, "910470143109cfa41dee4239dc059df76ea56013f4a0ba9afadf1fdcd1d6a0e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fe31a29b66e5d5f086cf086210a6c9df9e461d01a8c17e37139280ef5eb93218"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f8f75c68a230e30ee7d623197ca831dd98529608a82e823625255a8cb6e7a995"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "3a8e0d7e65eb1ad17f776577ce38044af71efad72601d5e4f50b8ea2e5a7f219"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "cd3bf335f749202b056e7c27aa6db0eedc0bf35aedd8523f4724306103d8e6ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fb89c9d911d299d1d0885c425391bdffdc485a8715f5181174db0d3a58b4e520"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "32624696e44e583416295b7c4665ad9835f3a22daa6e0710f420078b103d3278"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, 
false, false, false, false, "ee629f4a3faa5e22b986fcd3bcaaef62a7630002f5a04d6372802f775cce3aa5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "f7b0d4120376ac83f65e10621a42580563c4eea967d84da713e29dec4fb98cde"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "fac91a1c34718c810334d96adaee1d22e1ab3c757bffa0ebe4c0b65e9eea6b25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "72c513003529699d803623fe86e80c4981cc7bc051969e7c272a812698b7c768"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "f6120b4c050f7c756bf7be9eef5a9c3ddec61a7109d3d966ba7cf7210ef92e9d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "dba53dfb1e56d2af0e2d6bbee54782d9d3979d3042e0f8671b3a588df21e7119"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "6df5bfcfbdf90427abcb9e8f13f561fc55f08cff420a0477b566251b1c00eca3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "fe99cc8826b593b63bd21cb8bab7e0edd4ed50f2f4bfacaa8720033d3fb3cd6c"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "95f8710277c1655bfd3d7aa20ba889f777d671097000d3c4af5bea631f5bcf88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "c76171127ede972dcd78277a8f7909051bb9609927e61d93e85958c635eb503d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "15ae7c54e0ec8f1e72972f0d39aaefee479fbe1adea8be846e13aff778490272"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 
3, 3, 128, 0, 1, true, false, false, false, false, "b6fd59ea99d593bf949677d70b3463fb72dc0e7e6c56abc34259513e07a485dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "ab392a85cbbde5a38d384b17d82ada2e8ae3cd2e244a43e09a8ab983afad7e7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "319c3f80dacf44ab27188e150b856d64d9217406b3850cc709b3a2fe8ade8a30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "180283ab1e08ce7c5128e5223947c8baab87efff7baa91846834381aca9ddbee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "db90cc96cd3049adeb1f39a26f84e36b78abfcb9da606cbc8252c40ad3e1b555"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "783f821c8e1d91ad6841dae99a42ff2ea11b8ccf41a091a3c4b43252c42787c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4c0bb6a97edb149fe9908d10dda34226256f6348112bb6e646c28e8372a96081"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6999c3945b25faeb518e358b434203e45416d1372b909927c67b53619ba78b05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4947d0252e39471b7692c408ccc95f6cf543a9f969d156e0fa9a69a625d88890"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3597b3f12e3b4db416adee4aaea46c4ba3268d80109ec8f308a78c6b69752b80"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "99d054b2947fab7904f64907b66bcadc5eb7b7429c47fa7e93730976b3e1da59"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c0610b0f058c27685f32621a73f5e23c9b95ad81e0723fb78cc66262537d1bdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "af65a0bf31bda517399819f17b85aec29055b0036480a04571adc66104a5cef6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cdccb36a4c1f57d5859f5fc46466fa7d3c734fdb20faa2c094b397ed0752e3f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a77d0fa62536fed8fbbd92affae0670f75fc1883205bf7b37839f10e5c1604d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, 
"ed6022fc40c11c91fe6cf0a82e40f6ad1bc173d3d42383e39d1fb5701ad33013"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a802766014396efb09b646d6bec11a675eab930bcc4ffd0abc7333f343ac9ad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "990208eb0a41975334a92f7d837a816fd81629f162d73a3cacead04b56b88126"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "25a4bea79d940869c65af2e78f1b7cc715c81f083ae424e97b69d4c6884c557a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1000ee5bd0f9f7cb6431a25053c4759b68004d962bfd0601163d17b24190dc46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "354e0a51fd4d45a6ed5d843f967cdfd9a62f57fd35b833de2e37d880e0f68e0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "77c5a9035b6e83e1f6c59b5466f40b28ad5a6e48a10cd1c6ab504c52b4a4fdba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, 
"af2621d751e2c94dbdc7d22462343b8f221979f77c1c06d17e2d2a3132aa5a2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "21fccbfc13514cdfce7480b589b4d1da7a187aab01337d66e2d2ba9909041265"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6027ae0700275bac304732dfa4d48a8658a444c8b32cefaa15cbd97caa716434"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "0f5044f0bfdf7923d09a0d4fd348ed4916b828f66b820d23e4b4dbafbf951272"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "679c4e47f17ebcaeb1073d0576e5f8f3ed658e91015ae128781ffedad639ac6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "475b831336b7123b64bbea9cef05b2d51594b6bdad2a933e8d5b3a33acac401f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "119e2e7ee2c2c423066eef112367988b691fefe35771157a777e4cead58b30c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 
16, 0, 1, true, false, false, false, false, "552525d06606b513464242c2369c5122a900c0dfb5479a88c9cee669dfffb8df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0c4fe67e3a2376fd00e3c882a83c83e09f34d0587a642fc919ff6d51eea8717c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "16f0546067830150119e35ca4f439b0e4788e237d79db009e814d96c8bcb55b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "34809a1a79cb8d6bbd346bcbe238eb54ed00bb2a62fac48f164a1243dc42f87e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "cf9a146835e1a40f632c8332a75b311f0948d8ae4d4232128df4e7eaf82ed09a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5e3a0b50a3f12ba3670bf1b03d0bfec95f6509c6145a0506d0b35b1a2d979a41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5affb6a10fb4537113e4448b7cca57557d1c67208fab4dd26fafa19f83de75a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, 
false, false, false, "cd60954b24b6e0bba4fd4b29aa8ded5204d45f129925e53d3b9eb409adf76e02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "25b945cc9e4cbc493a9e92113db939a24e5429f50c5d1343647ce655e57ce571"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "495a0fd5e7fd55cff83956a5eae18e70010a1ada30860c71cd18d8cf8a163037"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7ab3abde357a8946b4de1a3ef5f0a8b191e31e0b81563096de636419dd05ff8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b2f79fe40050501101675c924342fd4264d960f76e34564372fc4af2cfc6cd1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e353bb11f652fb83a5cb823b12b53f6a0b44c3be0cf78dace7f88c65da7c6e55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ddd62731f9640eb309c6863c23a3dd00c698561190b59ad8c9d8d024bda49a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "cbddc62cef7b41b95e143b0151c185c01ec24f0e6186a3f01d8fd6a329c5755a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "af2e5424db6e2dff412cfcb940f2338c89e61ba081bb147aea4a5b25071d7544"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5eb34c2b7f7c4c6500c752d033f3837f59725ba78cea164618bc3c1c46ef7db3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "26b2274e9c4c79b701f03fa2be9d675f2d48400fe549fa1f1adc3fb9963675d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c50f4dc1fa23512b59b59cca288ab050eecbde73ba3618b64d9e4d1aa72ad6da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37495d0323fb855a8956dde4bd74227ec9420c13deee8dd9bd266a1d96c529ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3504ada3ed7acaf9453f7919d4325dda9d63d8fcc474f37882d0928828aec84a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "03004440710429c36b1ce5436ada0df93a25127cc4e1b85ef2533b847666c402"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"70bd2a5436c00ba0a316b7ec09d37f0b7daced63bbb277d26191522c246a9f74"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "7b4880063c32d6f6d637a680bde54a523dda078f8204317999fd61f7c1b49177"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "b717791c919ec3e9a5b4704cfef5d608849d28c53f216c6229c91dee20467e0d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d0229206fc919f01400b281eb423cdf0f6472010fe8eb02d29fc47d3cab4c32f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bb81a50775699fa596fcc3582e4ab6c651fbe0c4fe52b222f686fee6a1929785"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7543df7e51fbe19b87d5d318694593706b10b3d36ed39fbafe80fc744c1426f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d50b648f983c2c0c2ed751143a9aa4e80e5aa292f1ca9b742718a78f2cb6e8f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "1855766a2fff92aa739eb79f3821cd95b70ae798cdb9dc2e6d518c2a1e920dce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "226c6db3e9e762cd7990d46a47a25b915e8f71df5c2031c429afee83affedffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "dfb1ca6bf3f942074ccfd08807345b28f855bdab0cddc02b6fd8bc05f9f8de2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3a271b95c841e60f69854a37fc8ace9c12f3e239b11aa37db1096331175bf748"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2cf54daf6094044f87ad1854011ee49225fb8b863a8d75cc987a1449dc39521e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "bf83c8d953d571175eb857766f1d55d76e9d5b180e88c9f80c908e85187b32c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "d2581978f9cc5eae715a86427bc7d174e79edb1ae87f1b5ab6a7f2a0d2bb5b1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9080b2d2d4e00aec7d1d03108edb8cb7fc72358b584a58b649d8b516654bfde1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1a7377308a68046725fd514a1574790f2239ebb24de5572bca14bf14c32c486b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "12cb04ea1205a0e64ccd2c6e0db12c1a78a744fdaf1e5577bc5b7d9d19e3eae3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "d7a6aaf8d6b15e437cdf8ea2444234a69e9642367c01d7d885227c5121de97ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "e8ca6ddda093d06fbabd963a272a121b75ffe4c260767f7ecb578101696e907c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "105366702a1cd03200a21260b5e51ca5e14313c09babd5cddfc28182b011710d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "c8c0b3e248de9a6e3ede43185ae3d9f49b8a7f3c63963bfdfdc7d53923b15c28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, 
"eb39bbdd51e2cd4df54a2a756a9e9fca9935104118b427a94bdd37b65decff11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "a109b6a6a52e3e94465b5bfc345b7ae4d519f01820998d208435987551df4ca6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "ae0df58343bc0253a1b5fa5e35873da21796ac7417782156b7cfc0f1477f3bb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "87d31c3bf1f1e9413e0f6cb02e502d8d4a7b81a884cd3040bdccd644812ba668"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fd72529fb396a385bb8633406678468c26056ee55a7256504532fbbb0af465d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "91556c6801ad8b8e5cab8d41a3c7ad39b12e5cadb7fa6fc0798cca35d581440b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "86fe258d11d9eccd5ece051b9d64f8445da7b562d2a2781aa1e261a61f71cafc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6d54f71dc67e005eca4bf81a0f51de9abc5481b8a5395bd77f752313e0b5a55b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "349766efb40df3d7fdd8739d81cb615dcb19bdc2359fa168268ed3ce1cd31b09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0cf05d4fa3ce78030da80572e3973269e341de171510fde17174a9bfada4f7eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8bba587c46f50554b76897865b48dc97cedcc4ee63072a13201e6f74ebbea44b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1bad0930b06e66cda02db25335fadc3d7d7e328d09830ddbe991f9d40a713d10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b72cdec59f7f544a222eb88595e8945e8afb028184e48ec76a4d835a348133ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "451773d36d1142463e24a22bbe256ec16c26d18b510e047c98b34526bdb356f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "809598f020cf49ee5898f9e5bcb6de7df63d0e27e0979c5f8b588bcad5b69c83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "db8c5e5122602da9e2b7bc414aaff2ebcd2cb1b5c91015db8758771c3c3f77dc"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "da116c28862f149aa9fafeef350422876271ed715ddb66c38ff211ea1b238e91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4465c19385b3662486aa846fb5a2e4cb9e956128a2695aae45db13a7b24fe35b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "55ede767d3980f4ba3be17fe4428d1efb9543ee2e60c93000a8f25fe24d15d50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"35fa36bf9abeebb5253cc2402e069437b67f1bc0835177102657e012eb02ed61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0bf6fa6e316e0441a272a6ff49e6cbcfb792eb5f22d0998196580f7d15cccd20"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "fc55067d9bd2f1459475346279836ae480d1dfea53a5c9066b743d2beab4d9d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "5ce8f9a1fcdda0eb60ad8af7842cea5982de8036a1e2c062d696269ed4c494d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a744649583696dd26f0e5f273b3f92b60eac7e19544b187a56bad15f8296835e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f9de7ac9056e484936ffca04b15b14d0906c0baa8c1f2298883686328e5722e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "6292893b742157c855a5206060550b58b9f80a4a21141dae72798560c3cf230f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "04579d05a069ffba0e745bddd1ad049ea9f06d560af7f96ed8c6093fe59a44f6"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5cea38cda2b1633ea06d5b6bd66e587d3da9a1805436ad01f3c5aff21936c21e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "eb1dca3297949129b08eab3ec891a2b18f10ae6f48241a86d6ef9353add14c99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a0b73640214c3c1659461c8793782f035b14c26f6078cf4a175ec40d7869285d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "d54215da901850c8340a0c13c73d401b586ced6552c81d12d915a9ceeb5a2560"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "599632452110201a104d9bc9b6ac982d585879e0fdfabd0c5505531029fef4d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "cdd15113550984d574ee01bc769c393ca53847c17587d5f3cd72a28ddc8c575c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, 
"9d815bddb37ddb5632b5a80c23a54896b3e790a3b78d20c9c3e75f4cc3842ca4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "d7df8437ac72c65a8357c955aa7775742638d0ea8e25801d3f1fcbe101bc2eec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "eeaba6fea525cb70ab767b5999f84c348a5c4c7a0853a0d07d6b881c04fb78f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "4256f5f5c27100ae2465dbae72b2835aebdaf180342373a944455b36f2699508"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "089401654a02c6b3cd5bca1f67944775af74182443dbecdd0876b6cc8436f932"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "d60dca9b007cd952b311f5acc85c173653a8867d5b0fe7fe8c9adaf297bf5646"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "dc04ce615c45df36f8f435002fbe4cd7c22844731e18d105364711ff84bac851"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "41dbf77542b1b645096fb7d666ba6a760e170b26971e823bc3673ea8c5389011"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "6af2cdce640b9b39244340ddcee7a9620164308cb2435a91643aa8858ed634e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "139e1dc1fc639c84325c1bc223568d230747b07fad4a7243ae3b410e6131453f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "87b4115ca460a4877422561b7334d204b05d42ca9f4111415c6918b4698da30e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, 
"7cd8b316d48c662f9120856f9a3c68b2e70bfcc3cb0bd1aab7fad08ad03b32e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1004b95f834cbac4b90dd2322771e8dc8a85817b3b9bd5e3ef06fd0baabbc848"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9417b55e45473a4708129e2aaf4f9d1066c9ffff97b46966c3c7f5b386b7f0a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b85f6eae8fcbfab510168181965b6edcf738efdddc7f8ab653d9681fb8eeda69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6c2394b915e306a35c98cacc75dbbb5b9d61aa8869488905eae350c74e6a8b7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b11621b6012fa2f3651e459b9662c8cdc69a33657987d061c0bd18ee3e291858"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "76373bb94d42269c1df5f8e9a3c2f70eed993231a7699969b3aa5563b58bd888"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4398f42be31868152f36e794e4e71601bd2b0f7a4b9a0d358cd4c41d6b65ab35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b8114619279a5fd3c3c6a0269a2ca6e4e2d49ac6023367aba31745a9add26d81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2237c70e3775a24cdfe94605ae939b193c7ab7b1dcaf1f90f4333ccbd6b2f57f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d340f3839ad230ce62aec254fb293eef12935ce0becfc2a7fede1ec6247648cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "ef231ad3009eaededdb597101a82ba085adb7f5563699b910866b670bc7939db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, 
kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "fc850fc2f25df6d367e2b30af359f2801f541410550f87e72f3ee3fbdfc01b1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a206dad34a5a87975bf1ba725cb39b452306c58655ba96afa96fcd9ecaaa577c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2f4b88520914deb3d84f53f7406d1dbbcb915be4eed5a57909b341c219916f6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, 
"1641a25deb0a4c0dd3b079a4db029b122d42a158284d7d344d6a677609e3524d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1955ec43c4160460bc251dd1b5716bfd4a1b4429e954e77e29a467704ce9d84f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "46c6ead91ba4ee7544b901edf80f5b359749988d77e34a3ad029fd0245eef328"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "99595cf95edebfff07fa204b03052229e2bf5a8df355c61ac6ff8a16e390b44b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "29c04dbca7f23c79190e86302b7f65318026c85c9cfa2811d9b2754d27c1b180"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "57708585ba641bb3064aeb5685cd04405db2594c0f421d70fc2e82566ece62ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "2b31b39913bbfcf7cca3935a79aa34a5692a7f35b2fd50ff8f52f8b07ef5e21d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 
32, 2, 2, 8, 0, 0, true, false, false, false, false, "025f8ed0c686be03110aa7b725006379a11ff6ee88a81e98aa68ff461f244845"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "c0af1f147413f7fde74b7ed0fa6a499eca678eb531cc22b36137dfb8c9dad92f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "45eaa77275984e4422bdf96040a1d23f3a1392ec659795175fc75b3091972b83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "5914f4645d5029a1f5cd5d43b024a4bb19a2deb6ac1e674c36d8fb820e90814c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "5c9b5534278ee0d1924a640395c1ca1125e4a737a65229fab65ccfa106ff6d72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "51f6ce6d9d19f534c9f50e861b60d1c4576d677f8c7a5792a694277a4edf315e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "34e90f0039a257f9befdfbd4a3400dfdb7bd19e140028edb3f44474a93e42db3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9a196a35ff9de397dad1530143bcab8371ff1dbcdc26596e154ef618a2382c88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "07783c74e2e73166ec602be18a9ae930dc7838d1b86003a704aa9085715fb503"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "49eaecc0214f5bef9e28fed2bb81c321fc6c5ca46dcbbc6b24f903ef82242a00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "70436793c1a533109a4c86e1d6da186343a7743ace0801829e7183766edbc638"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "7ae5aef8921696387c0cf187bf5e2570e1a0db69a581c0f4802ca9cc2e8c287c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bd291fc5a330d32cf0257cec97f7bd8df1385fc3bc8760e5384203990be08f91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2fdc498324fa372bee39e206377a64fdc1c09d23003e0a2f8277abf525e72153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1177b2eb3331ccf7460c354d421065e9ad5599b665b2a0d4e1a49fc3ce794e11"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8fa5b6b34722dc73723979178e615f9be8012270dfcaaa9c31ed3c755d0c87e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f2f29ef42db036da9a441421803466d73b3fa020aa5f6017faa8ebf6c1652c4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "f16546495c774274a5a43752e7cc2f2df7446df8b16756ced502ad301e28732a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, 
"b04491061d7f6dab3132c1b8eb471bdc74fab7d44e32b3703bc18d5a63190004"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "af5abc35c9eba9b47ad2ebf173453a8a386076bcab8696c5d7abfe3c5374af19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "780b28ee83c86c7e2ce7d519e15ab4edfe2664f5226ed1d0597d2ab9dff8524e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0f353b6c4df9a6f9e9429cfb988ea960fd3266f50e61261206a4a231fe3baef2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "b136dd05344f1cb9cba10c41a365c3a345e62260633f06c421390ef2d2ea7b2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "535394865b631d91ec07e0b6fb7a4eb979e57da06b6ebaf14ee5547af542bb7c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "5af87e9d680c83933d2e160b686ee8c20e59a4d27d44724c87b878b56784d2e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c30b00fcfc19aec1550f9affdc60941501dc8183ba205d955088192110164f9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "7595fc0f4f3cd5deaa4656ec42a8f40c6464806af690fe83a8c6fc397da4c341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6b030f99f6b59fa582be35042845ef629e4ca53ef1d5a48bc6ed2f80fb2b0610"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "f8dfbd99684a41cd70f5b0e05d3219f0dcec9483a303185a0ab99e346164c2a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a4d8d76cf50feed08a1d46dbaef19bc07306cbdb265dd16732c52b615d39e656"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "486f3d363f3cd2f38a8b77b2beb48772a5b13a173703f5bedf025afce2d145ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b4d77f4b2680af7163ea1cb0c5a4794c790d1352013d7dbc6af1c7a739e1ec40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "47c8869d6822b13215294c3fac08a4a00b8ff6f71cef9d9f2ddb76a70abbe92a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b08443d9546b583553f1a4f1bbf949686b60925725e1c4a97d303bb74574a944"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9bc9f1eb8ce6b0dbda711ee592c4e89994384e7ebbef72a8cd7f50f1a1029133"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "a1249e8cd856ad119abc5477c5b629681d180be71df394a1b121f6730cc479ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "e114d2b9c8865579afccd15e29211d7c4082a32710c0cab4007cab0d0cbe1497"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "6ebd3b9da98cdf4c8da5df05d9bb142e1aecd31e71a02bda048bd19a3297d4aa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5af93f8b438346532127daf91e66025b1ef559d104e1ac5f9127b14c0ac26261"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2bf09d89d9aa77031b464f2a197202176823ffa5691683172c2c400996b7576c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 
256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a202d2fc4a8385960ef5c699c9684c11038831656f3420818ba4ae7227dc0ec7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "9fbdd9abd5cf7a99dd835b47e2a50583b5126907837962e6497f7986a14de6d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "bf2fb480bb19c7f4652758e59aa3528237bec05c984737a4b2cd27bcbb5ca43a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, 
"eed4e81532b5f9bcd9d3477c26cc5d1768cc23caaf32a41b2995001d96035375"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "7a5ba42336f009cd219a7ecdf5facb1e08470bb2c8f9cdfff8058cc99754b524"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2cd9128007e7d556f0811968023a216843433f86d70883367361c739dd6e50ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "fe7fca32abc642fd69f74ad50784cb45e30c64c644baf0db9af6ae917f701b19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "9da0a4904a71935d6b624b6e049b0f6ffaeb050aace692a80dd3e0656c793f8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3be7b25ef9179f28e225ddd4ea5a4282c9e7729dc795b2c577f7bc60c8853e89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "77f7591e5ac574a47f6a961c1b6303fa02a454f4957b066d4d140eab54fb5062"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fec3fc11428734bfc86f8704225b9bcae964fd374a65dec92331e81b6a4b1a2c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "8db1071d082829300ee8c4974d8bfe2d16bc0ccd0a40967445e0ed5371c11710"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6b6ca2eb265b1241eefbfb6ae3e9022606f99cd1fb18235a1ee4a6a32a57869b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7feb8df77a83c09ef2bb35026dc8dcd193a41d259f6f9132f552dd52aa3bddc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "995eba586268ea1c7d763d84593ad0c2a40bebf01926191e3493dc5142220ec4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "d3ce086729f126237ad5ca8c089ffe47c4ed54437925712c07d9530afed27daa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "44e718bbb180e55f5d6675236d0a7f9aa6198aaf82503302e769e3fb9058ef5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dbe93404cd44f27335a694a0fb735ebb58dbaab92e4891a081d0ec1c77164c7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "778e8f73f4662e7bfe6008d4a587dd909ea30cd3fe5fc58a06fc5f77efc1035f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "89eb09671d50267e8a3ac91e4a8ad0e26f8bd4901dd5a9f2679b51231ac11b45"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c3640b6eebaf348a0a34efc84be4c6b0cd1168a865d33ca1de1587f3b480c512"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "30420e93729ffed8e08732d1db773839a5e1687f464346a21c88dfb6fac609d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, 
"c94399b535a69e2890e68c2e0905f1a339b04168ce751d776251154120c1cfde"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "457bbe4919242232343f56a30afd83770c1e36cb83072e7cc7afef06509627c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f062157c37b0ddcefedd761e5a7b011546f359c5b8c5977216d1fc41c762dd1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "dd0261715f5093b1d17d1e2871b1e49c8c0503daacef91f5333abeb3feb584c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "96ef9f8d6cde14f3c72faca0ee768cbd926b427beb041bea3c32ddedc8e87500"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "727bd57a04c5b6d6dff0fb5c5b4f4a8ce783404c1bd8517fa00e3204e2f97e56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "b735b150347aafff12b81f456bcd8e036e9a1c82c77dd7dfff4c9ff727e33cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b7ae594d3ed3ec9ffc4546516b63ebca633c1093315f3936cd4159d264a8a846"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ad0d52c60921147cdf069d5249d59b60aa5ada9fb74eed5c0c48781b4a9cafcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "182db23b0fbe9ee542c49e56880e7e57d03c30b269e18c3d963730dd514e65a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "652cb71098e755b4607bd44a2447e948ffee37c5ffe6472ca5bb8cfa1fc18bd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6bef5089abee8668e7d310c9bdacbc578a45affad6bb69fb29d92f5bc99855c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "c8240c0ffd8b67eab5af0c1c93ecb48e2234d4feec274da75dd925134ba26c8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f60c9cbb880d7dbcfeb436b3485297408dae87a0a2d0a71dde3a063c9f72f395"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "04f8c740875b512537e7cb5266f58ba3f4472683cf35ec7547728001172f28ae"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9935e2c7a175c5acd427d69197296aca405d69caf9b706e2af049e2f799bbb99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5c4fca51ad4f3dda7e207e88e8949f8bb8132aedb853e35bbcb61a23b05c4876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "eec10f431301ed6d68e517a4b7c8501613b6554f4ea6002d4bfd395d60cc9eb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "86ec86feec79290480ec5df4e180c1e6eb860edbe81d1ec0f94d0b540eca67bd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "9ccf94747545de79b4257fd671e92c91475bd109df41913b1d3e7dfac079b886"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "182641eb20ac5c2e613786942d76428c95d113b342a27e6473c711cacbb06df9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "4f34c9518989e1d2385e14f53efcdcf7cf738e10c0b214ef08c8d475268916a8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "1fb715154df7a62a4320c9080df716de2174e8d3f92096f6b1aeb37f0eae1bda"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2cb883d211fccbd30a724fec581f5332ee3d56443d8abdb0da9273b3b8c0fb53"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "9fb46fd2ce300717c258ec140278d25138ee5a6d998bcf0a18a2a76ec981a48f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, 
"bcb4db62ab6719b8186e88b0abeec89a400cc5f2c6edc4366040abf72f113021"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "9c8dcd91cc8f5818b9e523e37d4b0ba9278883f72895d810003f0568307d0d0d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8c52805fd89aa6ea6da96fde7a360ae7dc3497ce2c767ced3829861b38dbcc20"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "bdfeffa725459f11363966eb27d70069a8449e76bee11ab1f004349263b252dc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3b7e351e99b5cc2896efeaf7352bf011c44350ed3e6f4e7dba4aa1a84d6513bc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2169d53ab8b87d6e053637b9a766e13da6b4b14e1da7e7321a05bc8200d1403a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c7b8f8a8d89366aa9ba1f9751c997a57d524831f21ada80fd3e6ee12cba5e484"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4ebcf2d6b743508423a1c19c52ec502a11ceaf2b997483d4cb43f4dad81fdd7d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "29c144d99228fdbcadd86dfa0120b0bd1e0e1875a74b9c4a235771e323cc737c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "230c543154402823509c20cbfd456b1cecf05914923cbf4ad29e32dfdee7ccbd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "87fa015dcfa945308865c304dc1bbd537e69d68b929940b883981e1c35b8af0f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0774f283f9a89fa04df5b57e561460ab108963d82c3a3551cfd810d43b7b7892"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "4f55d799fdd362a87f26b2dd111d611980270ea4984956d45816cede5f43439f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "bd8268aae6a85789cab10d9fc4f2e0bc4bbcdee08e5cbfc188c3a0764127961c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a7f6b45cc48f4244fd67b28db27edfd4685ca912174f3677d0f3cef7e6a93b80"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7dc3ac9dae1e073709e299e88e47cafe9478ad618930785045fa0acf138681e6"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f38b469bfccfda22e8f335068ce9c7a7b55b8ee165492e82377589b74ed99bfb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "8ab1c905671bf3ab579e937ec58e6e7f2387eb649136ea8c6deccaa04fb00ad8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "b6638a471f09fb65c8f34bac7aa010f80c141e69d027767df809533c34c848f4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "316a46a33b0c9fbce73cf34e0c60bdb04784384f47d9d84262011ed2c25c277b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4944285f0a277fb90eaf1899e62dd145e6a89f95fb9059c4eadcc4c4b90296e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3288a4266f6961d2ef6308783a784436a3231cb4470318339b9c7fd097fecdb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f41b6fa8ec1f56da8371bd92b957797e27892193d243217f4288e8292941ff5d"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1a9e34e44acae43d3b0494a98abfa6f257220d16c46f1002f5885f9c7fef6002"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "528847cb7ec54aa25fab979c8a3b872590eb1ccbe0ee039240d67c5311d8f4c9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "0f39d9ebdafae344cde326cbef7c977a37c8a8cac0836673235ed6fc919cc284"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a53d1428ab650e4a4a0e2147c90093f2cd92f0ff79696ccd85d99c943ab0e68a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "55137cf5b1dc74f72714070b500a90d089ffc9c9758bc7bfe23e616447e25aa0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "0f7cf161327150d48bb1c336ee4be582eb361c990018723a4f628cc66b284ecc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, 
"460fd548b29cb47b45a2c696ead62cc2b484fee6f8a0cc0d22e64dbf389dc74d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9f38e033d2ce8f76a1f7581150eaa91e13cbf4c37fb3dd20de0daf7f6ea9f118"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "96720e8665a8dbf64a2e3e069deb6d82c0bf40d247fe9256971678065cae27df"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ca5e80b5d5ab038a5b2ca69d96ff640dc413da14ed45f60d6445abef341d0d1d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "42acbbe845e33f78c1e40d920b97bfb48d448175e3d0840d7d094c9c61e39574"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "821ad1981854714ae754c8509ea71ed7b01515c61c0b31de7c1e19f4afd76598"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a6cf5e55448f57e63d2746546b72f8ef2c358c636e88d43856c948f2fd8e2d47"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "acb2e8ae4bcea0b04a384fb57a0cc73c2802af9c9ee20b7f3a0c545771d7d048"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "73b650a49732e42fb6d83526194bb16a65a0d2be6d5030575cfa2658dd3b3136"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "70f3f75cab63dd824ff6d6bf895457d1166d8f7511518def56f48a257157d5ab"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "5d26a17bbdbca8a957dbcea7f9023207e1401a33191b9db2e57f29c67c7f61d2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, 
"be549af1f43cfb6b406a72992e9e5dbb60142bf0a55cda43d3b8adb62573a00f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f2948cc24a4954678eece9c94a9083540beaab586568bdbe8e10c48b06dbbc8f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ead78ceeec575730613a1f4ed56c40c8562a73e4cf5daff2e92badaeeb3ae6f9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "4c0a4a418485d33989f1b6ec5cf6d76b70326ca09509a5f7ee89248d45f6d5f1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "801d91bac1cf7e0ef3f1cf725eb714f2f2a5ed473aa30b987cd315d7251bf096"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6f115926c2e4dfbaec8ee88816a9bcec7b45a3d5685cd12e4cb2a39ccfb0a98f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d0f9601918d7b962d46e83b295596582d9bd33cf0d1f9bb7ae6ea34a7c6fada9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6f01c731abd80ba83f139b8e27b798fd9f2805ad3572369585d9937dd046d7b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "32e0e4d173f43bb1cfa0073f59ab35c6b606e1c419bed50ed3372111451ad37a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "b36225dda9fe5c4cbf30a335674cc71fbee86baa3f5012e33ba760db9a3b899b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "bb416dca27d7e494114971e48c3ae4282f0043ae526e22f28a316807ced8579c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "6df75b1cb2dab6f0113ef8e6eda7369ccd139fab8f109036373aeeddab8b1ff3"}, -{ 
DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "50909b1d8e5646ebd23ae7af794c4d44fc4f22973f36ec0c7d61869231adcb98"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3fbbe94abf9b7b7f63c26886408595863da12330774167e1b55faa08d7f9cc7e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4a03907230f5fa458439e610e5be58ee92ff10fe87b40d1230c18d8c9e22a05d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e615c26f7aaeb165b8e8c648fdc4314a8a8e5d759185aad0fa31956f967467e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d29c4c4182aaf7a290c7f7d0c8ac970a067d81a3695ca7a85cf9a72e566b974e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "eb5213d2e580a3119177a4e053ce5784179e7bcfbc6b33d203a8a888533964ef"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, 
"677f4c3c1437b284eb34b9845d588a3c11a8b83eafedb9338a4db99b2c503739"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "2aa945056fec691b9b5903f22213ca344ddc4ef8b9c2bc248da34f64cd1abc6b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "75723738dd885b2fb9977c3facc00bbf62d43aa0d639751d61e032cbd894f147"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "257d22762003dd5123c301dd0cc81335739c13d43f121d720a3179dcd7850ed5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1363b3a9aec1bf998adbeba25d199094fd72df0ca373012aeaaa11911ee11272"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "fe4edfa081b5f3a3923bdc6bbeee1fb6f84abb88a76b83cda5cb4eade4fb16c1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "61b533b424fc566cc6217ea277af878866151369503ef3278d9fb31c2d97e53d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 
64, 2, 2, 8, 0, 0, true, false, false, false, false, "b01490134b8237b2527b67eb0675e1cb3449b7a2b9bcea9e3b8368eda28b11fa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "b27741daf4b3dc2c518cd321b924f2c79ccaf4a0089ea7236a6ab789dc2d7f4b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "69ac21eaa149195769d6a6cf3540759209cd24fd9e29a0cad85d03ea90d3f249"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "6e890bc105cbf707132d3231951f226bd9d31526af39ec884e310e0215dfd5a4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "c8c16dd271d5934add27639892307646bb54e3ee9caac0b9b01b8edf84b7f80d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "0717e1a3869d1df32505f9aa54dc03b2e69aa283a1fbc80aa1f3de29ba05aa7c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "72db408a6840bf99f11594e8d0f8e69409a4f156dfc61bb993c8dcc550048e41"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "455dd22753f9e41802f827a1088d84e0ae9f950ede3ac72cf8fb7321e025ba21"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "dc682a60e0aa9c57b348e60238acf0639f718c73fcf46a43e4831d8f022a78cb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c5ae1a1be4491c83605392c0bede321955de62e02c4b2a9a96d59601bc3ce23d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "37da47fe73dc7fb45b4bfb41e5272bbc84331404f29427fa151c12d6753f1748"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, 
"e2423ff169f3bd1d7fa142347371c269e837ef7c23e3b17ab959e5e86cb83539"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "0bfb53cd894112eafc6be47dd328ac9da15b177a9f69b81229c575800f965047"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9162be9d1c69258a04b68084bfac6440c945f5b155fdfba25dee6da9e410e8ec"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "fa537fec3f6eccb3340802e360707cf80cee0e75aa9b0ad9b30ca317601279b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, 
false, false, false, "b8d3a946aa2da9482cb69daf02614db34071f8091c5b6888cb9edc414ba971e6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "ddf823782723bdb9a633b18f0d9c0f7053e3122924230c0194240494b80a3b84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "3466310d0a52f42e39136455561f0b80ece163f8aa473b053982e2ef5889b29d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1375010cdbff780835889587802d34c77f96a3e766d7a75754212335ce5c22be"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e7045b374d11d7a61bd34bf3e59bc5d4658bb955748ad59a50bdb0ebb76c628b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "4a8af28a5af81b4d89de7e39f5402f7499382ad8e9d8eb11512b00bc174d9684"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4008406992419e8da952d25712da67eb5ac7d337edf7fa333e1a85509cf3946d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "e1db2ee1ea569c82ba5bc6074c7ed16740365a8c5f143b45b1fcf56d95cea6d9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, 
FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c4905ed7587be0042a35ab39c9bafa1e491d19130be168dc56f66c3d3e9ca7b3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "9e811866ea52b3b85faff0625849b153f94996c248e334e8e7fc3695104e1466"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "5f5f481e2f2fbddb40dee009f4fb452d9d52c9789c4a7a68bc6e6c680e7ff118"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "d63dd3790eb52a0a7d66f10490ffeac8e334802128e1454cb052ffa22f4c5a56"}, -{ DATA_TYPE_FP16, 
DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c6dd40a560308b28c5c309c448f5e5ae3c36912d68e2277475e884a3560d1493"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2e968ef8ca052ca8927c48cf06621f07f48bebada7df450b71451635de89584f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "374c8eacc50c427f4c49869b57e1a3127c10545d5f03b2a05222d257f4563124"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "b6c55771e992dd9a2e8118aacb148d0c4b2a23f1f5b590084865263fb9095a77"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d418c0d6c339aa2cd900aa4f137ada5e4398ea23ffa40fb872c08b513a9a84b7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "83440b26591540db553ee8239c41de66c6cc316b18cbdc2be83e9e1a124dfc13"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "cd04010643d137757e0b3e3d5539d8bc7a8177e8635e559cc048184766931533"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, 
kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "4e044c7f75e710c8789758e4ec5af1f6e4e5de5f10dbf47da890615f7f22b121"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1e4ff83a06eee63027374707cf29d6029a38951ceb9e99ce119b2e97ab0e0d0e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "82b2dedc92a260b7138e37d97a070efb94da1d2719b8feebf958acd84c8e3b8e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, 
"fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "41eb1f61966e68858f7f810502d42c30d6cf2fb95f29f271821d250f55bae374"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "3ec454cc65391f255e6dce530fbffee768e753e865b073728235edf63210a3a0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1f3bf63cc656975642c2c208feceafe1670107da6c64f584dd3b96d0a1d9c9b8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "285002b3ad35ee6e9ce0e93ddff1345ac30d2087a15eb64aaa518b5770249847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b4ce37da664e1c8d09ccd8b41d65e9380392e9f0368757c492759792c4313fee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "dae405ecfde1a7086ce0bb483a1ad55520cd4bc3b3e2e39e9d5998e24a16b99c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4a0a046ee21a50f35ceefa7ae2e6b4998ad6903fe80f51e4f5b4db085589ad58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"361c8062b46b59fa022bccf62db501b5c2329bd22fed4054de1914965390c137"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "de476a4e7ec25ac4c97c215c618e086bf2fb6066725cff4a065df4207b7dc649"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c33c389e8dfa4426547850021957807d567b54d373961dbcdad577d9e55bdfb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "7dab3be35d8e74d50f5cc767b6c9ed824dff3d4fd9562356e974fb6f97756f0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e6f976c884ed4b763cd8dcb00354da177d2c18be7600e1581ef9b108e26d29e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "413c12f1020589ec3421625a92ef7f48e42f01792bbd05669ab114f0e4409c47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b90a2492416e941da09f190e2ee48f6a4cb33371b4e2538dba6ebff48d614c64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "5245a3bf26fd1515f05ce53e0497a082bb6f3ead23141d215a50d68adf39a12a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef95050542686523e0da27d52c19d45b81be68137b91f06701e632f272e65712"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "31c8832d1023f66aadad802a5372adfd4538ca26aed384d373abcb8a8a8393f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1c6742d8ccce5ef2d81eabbeb8606952512b7dffd8c9a045157de032f798c25f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "d581c5223529ec7f4c110eca9da3cc749e038a33ac5b6baaa1ade60aa1dc504b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "411b530906f9649de9183cbcf91de6d4bc91b19949fed85e3c78013dbea28e7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "68477883f806efbe60c1ef0484281d3c3d757d50ddea48be6dea8771e6cedb5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "d96ebdbb6ce00423cf6ce0cfa5d88f3cb8b136337fc4f6a373697d9880de54a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "08fc1866315e1c6e2079fdd338254a20ba33aab55a42ca4b9829a6913fcc75af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cae15477dd94082cd6ee270ec5e2ec5f4f82c770337ab2b50f1299468901473c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "650ca39f07b7191377236772177c6dfbd44d958c3b9a17f26181217a9543121b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7fe3aebfc7b8281ddbce97a73ce68501815772d1c1e283d712d88a49133cfffb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b2b0aef79fb33d27af6fca313f2d6c91f38c378256d02d2bcbd30f670173b341"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3397cc45eceb71d154e18a61c877416516568476cff80257b629f119d01e6226"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "9139bf19238378c04d5d0e0230da1dacafc9109625e1de124061ca3aa713e82b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "2b2d903b3fd51f2e371d7aed07485ad703ac97714d3cff205c69d2b74c4c921d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "07474ce73181c7744875877f7d02511922da050a6310eb8cd645eec7a2788da4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7f1dc0923c1b3396d15825fa57bcfc59ec9a29f6e80451e5759cf9c38da985ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d969b25dc758c6fbafc2c6565770b339a215c27bc529243cc61ab9358682f380"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6e292cab669ea2df096072a16868b20685c6f32ec5b292964015655c036b288a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b23f148b10dab1c8201f33d1566badb8e390cfad0f4d5a1076e211cda02ea26a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "21730738f73c5c5b148b6ca80c5b44e6811762dc97e54e4b021b7bf191ac4c41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "334a8662d98ec7941489fded603a2bc81efe7fe37e4f90f6ab0955fd585ea867"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "410fa3dc445207cc7fc9e7043fa63d57707468197952a660e14dbc99ac79fc6a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "4922aa042d713bcff368dc73c7b85225dc3c70b32dba07fddb07a9e3b6c7bcb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8418229a5b6037678bdee35768d57f311e2ed224c02eecb7677dafc0cbd63f58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3ce76f23d4be064c022433a1d865f17fdc27658adb460b5ad8d3a446334a37e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cb0218fe8e26b85a022ac2025760a589341ad3685c82f36941d1f852ccdba6b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "91e889adbe86e5e0f2c1fe586e50a90d7cf115291933f67ec32dca49f584c77f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b3dac11c4966a19cc994cae3dd6f6885aae98b0fa3a6bf98e7259d870a35e2e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"7ae0bdba58ccd95b96642fdbcc08c5b0b7fb493f852aaeb781154eee12042397"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "554545443eb760d57ccdb2a756a9dc20a268206fda77f7fe275489da55045346"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7462782646732bfaa443db7c5bbfc19e4f475f2e96794527df5d4052ec07539b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "57ddaba23e9cfde13c22148b3e61ef40492ac017906faec86081b56ad2a5eced"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "589270169dad7709a57611fb9d15ab20e835b5a5ae9e2a87da76fd183c722693"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bd81142e5170b90a199fb33df43c6b60be78fa61c3c5e03a0b98354d6471c5ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "181b9627019eb7bde84fcbfe0de6b5155bf2a88b591cb0e9c13b5e11bd31a5af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"070ea6ad709119cf7d31423d509e25e0e7cff6d5e652c01f18b1dec9831cc436"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7eee09c5d52e9a90d292d4f330f84c30a58b3924fe5e232d9ecde7873e94b7b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "1659c39a86c2a2bb508538c641e0031538c3055471e89d893d343d0ccded2c7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "0d9d6498acaee35e72d070c59eae4cc999b8fba4518eb48b9186cb109ab5b02c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "36c964ed340fc22a85ec0b5d16105e5c9a371f30a9ccf3e0516e1dead7d6c52a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e19faf863694e0183bedadfa89be862e7a8538adbb73af3587363145f2b67022"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "436fe7c7c30be1893e80762c017ec5456ad44dbbd282596592fd277f610be734"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 
16, 1, 0, true, false, false, false, false, "e8e53157e8d2402165a579c860c43c5a61b25b9b5c3e4e5dd2d808e4d27722ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "7e08fb0d278215481f19e1a7497f7d16b7b026638938dcf5d1ebe91db503417e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f433bb58cfbcb9964ac30bcec20fa61728c3feea648d207658ad123d3313b351"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6de13c035c2d3bc349ef7d2644fdb4663d761f1c9bc24879b0aa5a087172916e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6696e084829754c2fee3e85a600e5f94c0e86ee94ab317b4dad626712537e0d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e7c75814cbdca50752ea4ce72ff74a8a14f9c039d67164ccbc22678c8e2894d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8d5f4baefcfc1b90d5da3e3671fb25f1de46e66d7e13b9a55130e1381a56e008"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, 
true, false, false, false, false, "66708df91ef52833448d1f946bfbeb3d03a74858b19a6be855bee67d2df60d40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f412ac17cff96421a24d3d5f518ed8b0fd3a697f0c502c3fa8546f9f7ca2ac01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "63015997e0478d771f5b52fb293aa0d06d1e2441efacab666468750f867b95ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8faa0bd0a63abf9e4a15975e50552bf70382054f7a88bf60c4bf662b2c9084b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ac9ada1965121f33ff25a2e193c9140ed88d4f050b7f951ba682d3c16aa22ca2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b05d8abbae51e9ae009b84eb8ce66bd2b4e69334d82253ff9c0755bbf9ef15a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e4f84554733422dc595c4dfab3c880bf4acb0ccf732a521fe98eab87bd54a06f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, 
false, "81268a7fe864be3f4b982ec4a796b66cea3556f975e228cbeb3871030d37af96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "1bf4285bca91c05e13f868d00915ca0d0df6912664f43c1bcf56cc2bd0a4313b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6f74fc5cbbc5b1d67c4c7f030657abbf8511d54292cd403f823e4bd9574d04dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "6826ea261131f29b0e7459d5ca44f40504e0ffd84384436b5ccc959d0ece4b84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "a5f18910667891c8d2f645cedde4ecf0ebba40f1ee119c6bceedd599ca784648"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4b4c4e27d55f7cdb471e603f8433f78235fb6a86810c7cb1fec198350c378e24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "5f2ae94f2395540bd8872f8011757afdd16b08e6cf9130d319798c1e239519fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "cf66990c6c95224782fff685f5998eb411796a7d49e28adc7baa79126acc701c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "fb3b06313bc5e2151beeb9e4806665c0cdb713cc0d8775f8595f223186049cea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "05e63dd8691e841e674ace39a3a35e1b7ee763da37b9890b8a780245e8010fcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9c889befd61d27efda91f8cacedb055e0b41c45fcabb88d6e5b16675fdd49ead"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "6d03b6d157ca92d3782e79c8858b8f7c28d6d3e41c58b5b9f8c6c6228a35a4fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "68489ca0d4571f736a5370a52e51da915c60e209db2e71174de74b6410bdfcce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "350058c3fe7ed093e0049e2597e9125dd8f6b420621c8f4be4b22d19ed361e94"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4a2700b7152ad4fb89b7b91d0c715a6fa58c801456ecc9b05e7f10c73f9731f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ed6b20860dcba20fc2c727368d488d2c85e74969808ac9368cd25547d640d147"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, 
"2158cf81ae06df1f7a7a89b283312e978bbf88435acb4443335213229cff244e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "87912b2bdf6339f6179e34b1e9c0470d204f2faea71bdf1abf8df7c9e99da822"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4d11b716399c7647827f1fd6501e0f72846496e432530dbe217ceb3fe3ef25a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "3895674d8083516bfe06ba2ac1fdca391c66043e52382c5fad921ac97baf0a4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8c9584c109a73dd9321514e5afa07e78e6eb264e899b644df374ca24955ed4a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1df866c843a08ad3f7bdb1e0c9c5455b8668455aed3c026cb71d5c871a957faa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "bf26bffe4caff1f85a5eb3cb7eb2a579db727a9cdb965dd93a3fbbb1310a2d84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"ed18705e7c62b6a9aaea73ecc488f61cc54ef9714635632b13b8e61e29256539"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "50af3b58acdc7438ef0e362bfd3b55505a867d036721b192173ef7f4ec424f34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4a9185c5ac9107d15e8ecaf2ac56fe51218be6f426812916382633f36662558d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e2c79afc3c30ce50889e278704c2d24686db98b3f8bee6bcb2121e8881028077"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "16d3b525a6015c0a6cc97fbf692be2147efc35ff7b3d08987138c6564f1e8d9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0184ea32e6f67491c2a7db0b389fd1fa395fcffbe332da88be453bd2a54916a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "fdc64ee864c03b283c18db04fb35c93a11ad36b2463202673e1ccc97b04f0062"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "23b1a12781197d5d8ccc7991aaded8571c5ba4049a8d9763275b6f43455ff775"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef47280552f602e9baf279181e844471a9e33fd803dc655487133a2dd0a8dc0a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "bdc3e33086351daa36faf88384f8ad1d2be247271f25bda681b5fde76e36fa6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "d84d1ef9971b121b0a4a80741962e3b035f2d1b2d5542cdd9cd8f8c3caa290bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, 
"fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "5c17b060041843c194c41bb6cb79ec96aa407f29e4988d8676b0ed0f3def0ac1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "002cb8491d4a52d5c8fd28cdf375d0199ef4fa9de5d014d751b6d97c2e357b3e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "685125d950358ad9db696c5f93a738769c8be7123bc3e9c396737df9e9a0fd83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a8940b3f18bafe4b52ccb1e012572e83ab59dc7cfcf15579a8398ba73d6095b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "0d9e9006de4dfd5f0def250959a99e4144654cfca519df12a7ed0e5eeea1aa08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d6d5afead98163a4d8162005fc81c74f20a1a16a66ee3429bf33022c42930967"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0504bc2c68b7b1674ee2f4f9715f63bb7f7f05ffafb374e1d9ffee2311ab4245"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, 
true, false, false, false, false, "bbca61241a7d5470c977c6a979f37ffabbababfbcce6b04d4bbbaec8fea886f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a3674a44e52dd7bf455d9029d4b41af5621dca2256abebc0acbe02984226a307"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "196678b86c35615325c9296b2cc6e724697b593301b9f8d97171dd7fb062ea92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "2580fbc5896dedf76e8ae1af6d33cc44480dd337ac8312b0e112f689a9596e1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "07d0a698e20461856728bbb98f20767804be64bd7adb4c141695f56887914ae5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "127877f43df2b516483c5e84fad79c4b0f9f1300c1fd65ac30ebc1de10f68572"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "9a86ecd5d29d82f925ba6386417023f109e615030ec1179e2ab2a979dffac5ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1a57555afe868396a607b68563eebd3f4841afc9d42e19e058b66ed69f17b6eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "da584d56225e6647e0848f48f76a43f93365b361b31e3362e2f0b1974f4d04a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "c81942c37d5ba125201e77a281798ac6be2b3d1e0cfee13ea5cd7a83ed422539"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e2971443adbacc24f246506effec0b0478fcf2aadadfea56fdb945b385ba5f1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "89674128b8fc2760b038446a97b3c5cb39db716da35a76848f2551953b63b688"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "2bcd93fecf10f93a65fa8ebfe9e1c14fead4b5aa60a5dd36760dd231eab1cf49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "efce311aaeacfd31779d869549027c811c7ce59a2082c949a15c21cc638b933b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 
64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "16c7bb25f8e8369a41ebf46902312bbdd6479bebe981a82803788354ede97068"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b393eb438affa7a846d2262e2855e78a2e1da71e3b88bed5a445bc85406b8f3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6ebe217e90a0dbfa14c3932b76d4609cf06175e8f8207623942abb7ceb2471a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1d0d7f3bc80f385207f6585ea5054f8096a895789fc0871121b27467e460a6e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d799be43b1a27aa75d3abf43f5fcd0960ff28324b9980c99a9002e97529b230b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "17b4a3bc8ace51f359621f16b7749ded041fffbc2c24a8c208160628de77d0b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 
512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "f55c3c102d52e7e0945532254c693927e1144d046822f1fd22688df35176d6ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a2481137a29866f336db568a3e39d7e1101ce0fd2a6de4419b3a5c3aee7b294d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "067a651237d2357ab09b9c0b3ad5632c614ae6d60acbd68ebe52b81b675ff01e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d3ef53cf3152303eeef22139d1066c6122764653d8d0d23c5ec6e30c89aead2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e49f73056ba6879dcd684ed211d684745c211fc5dd2562969cc5ac51cd5a73a0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "03da687e50bd1d8f40307e533af5481d54be21b410bb8fe4fde5907443eab7dd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "25ca5d20d3d6b4d1e8356475c4d92d7c0e40c210bfaebd3b0df58aa968cded68"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f70f9c2fd3a7a21b3dc2a953fc9c00e4eaefd297a5a7289b2c88e47ac0ff44fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6fc9e7c83cc544ba18f1dfe0832981c93e5133709197a1c7bec241874122caa1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "222238a9dd8dec2b913b26c5c55acac8cc44c65651808a1bd57839d5205a2a44"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b3c19f490074286776c65f7f2504c9ee745f586d323e236dd3fb3f287eef78c0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7a0e360ac921d5d6540533e95571230731f1ffc20a2a2aa09f11de156d53eab4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "686092b177ad49080adf04dc6b165da89e44fe9490137b25745ad194177bc368"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "79c395ef232a280234e7ae0212c373c2238dd9fd4bebfe1ae5143d65abb1abdd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c3730cac4f311f6b1d8d8c5b1c026ab3450cd5d10e50a707e068355fc86ed00a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1b9be99d95dc504154ac80d10af9d5fc25ec92c4dc9dfbe259cce853f82ade97"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a11fdb48f481e747fd4a206ff5ce915c1a93f97925ee16607b0cb2972d506994"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3348d66a5b441fbca450a6c2fa870c7fca8247ff3329dfc9a5145f4f179dc65c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "322caf1ad82758bddfb734fca4056ce258580f1b7724357b143cb3751801f42e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "bf60a478fba8cbf16fa8b2beb1f6634a799db0d4c2e5bad2dfb1c2f2040392ad"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "03449ccd017926be8ac6e88a98cbf81fecac6e1d84bf51a80126dfb24f8f6768"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b0f857ec836447de63562c192ba78b7e6fce1214c80b8bfc3bc28306f73e7b7f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ec503ab4914b8e5e91464cd95c50f6a520464f49b6df4c919fa76b0a2276d49d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f58af9586243cf2fe94bcd0a99ea557cef829ce3a66c976f2b647e4d956fab47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f31ff02a352aeb93df080b77154f209da7775b484c810147fed08f2dc5a80f05"}, -{ 
DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bc4243ef05beb39c9a82b13f7bce858aa9ab70ad52fc83f0616f313eec7dba0c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "bdc77dd5b8f399dca461a1a2d03ec2e283ca834ef75770a9e73ebe04355451e9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "fd877921c396bea0bf993d767fe9440958628eb30ab4fdc050129bff6f9a9834"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "1aad80c4b45c9fd2204116991781f988cece34f0c43132e9d311b62dc2899b82"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ab312ef57e71e20e45c67b072d61d323c40cafeb3ecc6a231880584b40c92d28"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6935347240e1fed20b7f3efee678c5bb4bb1512e2c341590e28c91b446818f6c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"4ec08a4a82b0b29263b5b073f40c492cd1c238325e4a47de04dc11f621d3e80e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1632ed7994e2c4b0edec50731c97af800fe442185c4e6fc7f1b1ecb4aa0fcb2e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "da4000aac82326e4f43ed74e214fc1c7c220bcb70a734ef972fb27013fcc6e7e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c4ce90d8165312712be77cbefacd363c7d0596d83252b15475b3d8df4c4148cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f7a19561eb5de400d392bcaa272b9ca914c522a472e8391c2982d9561573b963"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "120f9b7054c042ac44201bf00954dbe4d55a404b95b4cf4647ca8aca2357c143"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d84389965bcf53cefbaa97b828b98ab58cebdb4d18109fe5cfbe71ad3f30bfcd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "98a73fd759b07673797a883c026050f9ac4fb2e1c8bbe51113667f9d25bccad8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ca62d2f6377a0bd13080a19c5b029f6e024514a7f7cbaee7d98dff5afab8e851"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b9916ed4097e6ec29d93ba1048ee5b4c0bedb9dbe5248d3e4359474894b3cb20"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b48500c7f8e1eb09ef85aece90bab46d0981cc4cbd6bbef5e6e057dd910e2d15"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "5baf200a5a104b73c4d511d31b05bac0d637f9c703fa1516859cd8df879cfe1c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f517ec87f78d21dbd197573592aa8d57349a317f1c8915bc8b65be7fa96037a0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6658daa15285fbc165fab4ff3bda0f66aab6f7845021401305afb15f4353aeef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5b3728bab415731a8a37e804f89f2076529198049107b3822faf24a21d51ef41"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a017ce9aa1ecbba833d48a6e0b40113d865002e8d61cd316c1724fbc0d03c9f5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9125866a7abd25751ea0ea21c1ff429180495b55b98925e1230198ba445d9f16"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "25e66da4037f63713773d176ace63c21051c082ef4ed471ac34ed95109c79e50"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2cb6f3750e4c30633c7343f83448eb2065d084de5636af9a39fd1fde4bb24f83"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "3369f08d22c4a4016c937cfe45c7146fa62566d858f58b02365b6201b2803a59"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ff0adbcb6aee973b8270d156940ce34360949b821bf68336b9676ac26224b9c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"661886b7ca4f9f5481df511af4100783a2552d9d9de3b93d420a4d4c5b1c9929"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d5e4c791a7f4d7071d1424fce55f8f80d65996082226ec7a46080351aa78bb5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "037404cf3f2dc03ca7a65a0c4121ad1e69ac8bdbf96fdf48376cfac923835b0a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "546f29535ef7135498d4b0fa1b1b09f0496d4a0f392c32ba121ffd8fdcc3a06c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "9c28c1d10633bd3687c120fa07d8241f76f10921b2b72a9fb23f1d14f7e86074"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "33983e35ec695ed6bdf4108612e1aed6a53e0487651285d76dbc115324c5cfc2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "80271d667ad4caf46c6e44b68cc9cee79e02c4f5ea0435ddaa96422a32dc80aa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bbe90255637d8db42b5b5832f7d90f7b09cfa94808abff928d0906c1d2ed9398"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6a19d2d2dd3ef3e3ef8ef00a9c59069d8e1d1aa4b63bac82f25117b4462d0e86"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "13927d266bcd29c5c0a8af8cfe46d6a2667a990d6ba3dce1c64f4e1c69d2e4cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "36e981355f6b1389cf2e61ec2f9c7ea053760dd95d480da13133ae3b678c261e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, 
DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "70464c9405651168e735300c90f9a85ed7b96d5d51b1fcc926879474b6f28076"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "97bcf8b9ec0bd409da7e93af6519f19e37b9f9dd99376dcd26d7283a35d88b60"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "dd0fad1a5922c0d8e494e3c2ae3be6827c900f7e97848652c3650b00221d830b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "601c75671c2e8200b30678d647aae9038e16d0d9c4ad853658ef358332d26d65"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f49d73b6c96067b4186c841f7b5a4cbc84c65db3ad00625e28aeaf4c12b879f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a018ea409ee1143ff4a676ee72d468fadfe5c2e3acdd93879b7912e20dcf9f38"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, 
false, false, "943af2b3366fb413b08413127f89bb1763c9612c57ac4bc59d0b8a63a9eb90bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4bb6fa7fa231928b0c65629e3c5b05c2d7c5b1ece3ec2ab413d3435a702b2766"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7d638e908db0529dbc6c6c86a6c33deb3a1f217ab351b2fb3f6343093359636d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "fa4b814ab09c41ee4de447afd7a90c3b5675e98ebae5c8e77c9b406776f12e1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "35cd9b42ef3498996a9cb380a6d1d814ce0da1947dd5c58e0bc4d059b623e041"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5126e4f56a25290c2b8d1f330843d6c8591231ae413040062dd8b438bf005011"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "dd046dc7b1845540c18b89c34c41ea18dc1e1a29085602978a09c9a3175212cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "4ad605c2d8ffadbca3d29353705e8c371915a1b0d94b58d670f2306d10bf7b71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 
256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "e0b875c5a0ae8e7c2945b48e749b375012f5d22ddb49a7c3e62a059165195f96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "02200c05bbec8791832887523523366175d5784dfbf671128a66a6913370f81d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f728b32ca36a0e6020da9ab1879387d0dfcb6c6e663d0c59d499a952986709b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ad33ab04276b5fcf2ab2f6cefd51335c8e8bcb17f0045b0ba9f06f26d49b74f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e05af8ec5f29ec8d104c977eef753a8b6b93715e97d78fe06faf84f68fbece14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8382b55bdc6f5b96cecc842ae32f63cd7234847e3efbf0b1eab1141422d0e987"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d3811617e9fe603097324687c57c4eaf8c4dc4abadeb18bd2b377230be72e6c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ba2005237f76f5869b0a485bc6626ebbdd8a6ec9bb7744fb4565b3a098244044"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f50f828ceea79bc786c199f580b5a7ec4803e29061d025a5cdcbd614c319c3c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2306bf599265c6e517a243d3b74d5d6f337305f7e8b79641ead03fdf44b0a547"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, 
false, false, false, false, false, "c19f80eb0dcef908b5b8d1c0ceead9933b5a513485c13a6d4a5c2e14a6fd964f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "887ccfd85a30616c0c58ad4e8eea1476a790a3da4902d7e18cc83c153d234555"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d8589f2c0222821ea9e51bbb819408ad278c4ec9552fdb5e36144bec5a5d24fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "cca3616a20e1b54effad66debe673dfe2141036668a22449930ff6b71ff7354b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "93408263c682a7d80429bb8a1289f16a598bae3284c6fc86a60d22b4912b2d10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b1d6a9cc6b275ae13f705df371e3c449a07b5194583764d431e016f84060dd35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ccf9b978862412809723d35b26edd4646ab5fc7249e19733efa85543c64c9754"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"05bc323d443d200e691312ccdd0d0bf3579e5f9626e218512600d213286c7c37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0817f661035cd9eb3f109bee3aa7a84a12b2c1fd02633318d85fe912bb95f2bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "0b578ed8b90603f4d220b70c0dad81ac23800c342f22b2673f812dd10380c4d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "72ad3c69f07d577491dafb9327197b6288c43769959ab05092feda3dc2d9e847"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7cac3b205e7ebf2ce1c507266e822e5d3a2fef248f0089804d400510fb345427"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b4cb07bc41adc8d4b5998321531be4ae3d153104c216c34f243e579b83e56538"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "64dfcc09d1a5ce5859b8e2cde6ba7460bbfac495944e63ae388936c83ecc8c38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "01cb5a933c043b54d53fc63b8e29eac9c16e6d03e0a002909b5ceb112a3e7dc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "15bd9807f0d9176127fd28fd396e070e2631caf3de50d06fee6faf1a294c443d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "599dedc127cd124ea5507968205e5dd8db7fe8c0bbb31a492e669083db3e4da5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "aaccfcdc786772f2b4d2aa33267ceebbf963994c86a236dc6282e6f7838a2d1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e8f010acd8914f1df8ebce4b064b06402c3fa87884b19c3e18cfe5833d2625df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6db593b663e49fd7e95bb998cf04f65f9c0902bedc188c37aa489bf902ffa137"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5b568f1e38749f0ed5411373595e07a0693fa0b2f969ac1d16c69ac7aa29a0ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, 
"bfb0403a6bab602491597d0cc806d72d85f9888a3ff0e83530b7ccd85e091bfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b927ab1aa4e7983385acc77ea780fba0d572f63eefc6390979d91cbad59c9762"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4d31561de088e89031e458eb788e2a8b6651aaa7f385167155e5abca5619a501"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "572245ea36afb7ec7caf475b6295856159fab32d3e94f82b5369056ee51cd49f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a32d309b5894248e4d80b75a586701b27d54042a070a9ffd7864f1b271f794bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a6d1f1a2207cfd2fad701a90719ee5524033cafbdb322667eba4b62d1d8bb060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3bc1cef645542b0ae6081cf462bdfc25d50cce04d5a7bee4ff121662ad4f1d58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b6d3766ba4b37945c5275821118b92d09ec4b01374a22ced66a307c5708cd07f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, 
kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5a211df68939460edf28780d98bd46c8cc286f68529913b2cd19e94781355d03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b879b38edf501952f15b9efde6a19d56a62a47ad2a23b1dc20754f72b33dbbf8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fb629ba8ba51a628be9565c72bdfe3b240d645702a4b48ab2ce674e6516236bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"f27c3d176194e9691be59e6d30c8467787da0474a01032d6f318438f7c051fcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bd029c0f24d0e40196794bfb3d44b13a34924126946deff31d7158de4ff96760"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "80736a8cd4ed89b599c0c2c3fc819403ce49518f93bf0ab1e61b3e3a3747e043"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3161c4126c5a24f2d63fb399d3abaf150492ee87f0b91f55c723fb4433ac1b68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1f44e818e546f638a3cc25ae2a674785fd8c255ce1c7dfdbaca673019a877748"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e9617006ac8da9b95a8c7536396a71ccaa5c8c0ee9866e1a23b68e3ff81840e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "90713b6980f809036f9bffc19a765e7bccc0a06ef1c4e020d9aa1790900b944d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "972852ea83791ac60941df16a624c05bbba193ec9b9558f03eacedaad6e27288"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "170081a70beb293b27ad2024e890e5fb342d2585cb8dea99d979a6e952a14431"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ffd149c2802544665f9a83092d6c37cc52083f9e33ddc80b2d2c8ab75a0c457d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0f14f17e08d3bfa3a8e8bc440ec9543573f23ba9b170a609d468fdc2c0918d7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "18609288798de4ad1e664829ecc51fa4d627aac44b3df1a5552c2cb8837e50bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9ced1100fa545cb1fc2d759d845a14f6cb44a44baa82ff8843475ce88742981f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "56212a95dfa3cbcbec9bc040f829f7f9193f441959c84e87751930be2ef01b23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4ebbfe62f0c2986b5af1317ae1ca3c69c4dc9fa5e71830f81eb1f83cbbd2cc5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 
128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c8e7802d5bd5709e5997d3d8402d664062d0bf4acecdf1c64d8ea6ae87173fdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f07490d6b0e5b4e6d0d65b9679057f3c78ccc2f6d6cea3902b36a47c1ad69c68"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "0b83ac5a5c26ac4e819c7dd66bfbb644c380cafabc9e4966c0441a4b8db71b79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1a6a520543fc30fab1e9a827e44defa4a4e8b9c2b5594ca74db49c585bb514eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4eabd0507943f2c76867f0670ae0a11a4b361cb2d56d88d2f23d83e93d47e1d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f2e42d947bba45db9ca364950e3a005d06a5568643da130763f4d4626713a63d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7e7b5b1d8d9e1ef6e73d44fbbebeb123d25221c108c86ca0c97636de830f1b85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 
192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6c8e1561c777c9b18af2e5f499b875804518cdc6e0ced744f0dd8977b275dc46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "305867fedfbc1f2aad8abe488ddf3e2f8d691de02f6b25803b84435a903f9b58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d8ec07ab25fea1a1b5ef5dbb6a27a1ccb5699652176d51b5609c321767f7525b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, 
false, false, false, "b3fc1ff1c6ae37fb1e2f2ec64667d171e481e0189a2c5a8547f26fac34cbbf16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "617c150a57938fb315966da841282a655ce7fbcd0f8b2c41fc44593e7bc2f3c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "014ca6ed77e613127001efcc67176c6ac2545c29697a818efbc150b6290c5fb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "67a274c355998846112679a9a3df4476a14ca4836981f3fc81bbd3e74f7ac684"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4bdc8201ed0589e885fed4f9ff2f1442b4e71e4ff957e4637c0940296d3eaf22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "91cc4280ef36620b79a20b88bd893adc3f00299fc169202b6ff64a7c3329260f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "33b0dc47969f6df59bb4619a61b54937e2ab01ac9f6fe94b1181121e6f226dc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9a12f7b88c138d09b4ceeea1a34597273509c3db45e3cc3cfaf58f8981dc03d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 
128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bf138563062daa80d416193f0fd60fd4fc2368728f1b63055ef142c4aa3ff44a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "144b003d9392b064ae1cedfa17e1aa4494d4bde0a68ae31cb1412dc9048a8842"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7ea47fe29534ea378ce0e58e1a1bf5f8fc452bb0e3aef0243160fb3e99151cb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"36947c46a3f4853ce4f31de5b01ce8cf9b1bf2491b5c4d3f001fc88815875575"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5ee2238ea7cb134ef6a7abc9e21af083e5a77da10e3756f455dd74dabdca2772"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "8f48d5d2fa7f11d7c8090f3a7743fef26069a73cd90b1d19efb1860cf220caee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "17fff9e92e29717e21a489e1f9d024e60732528880643af21ec1fe8735520ad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a9b141746d348a38d6efc062778ae56c72e24305e01ee1fe782524e29006bb21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d948582baf0d45319aa1b03b3f1ee6ef452a9953186ebf407cff36a9407d9aaf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d9325d91f51a3e3dd5f143cbfd7605b4aac0c530b933a268b67f59b272bf6b2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "943d5653df28951afad104b64aeb5c410c83b266e3f9242c31524f4274788172"}, -{ 
DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4d35ff8ae7a8104a77d57ab58d4e1ad12de618e4fc8398700a9918ee231fccdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "06a171df8227c438c0368aa7521e1e18e0aefeadcc1ff231b41757540064377f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "22e4f34ab9adefe4b7fb850086504632db090757d2bd5eb809b29d2b60997d5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "dce999ffbd29eedd728cb4ae831935e6e838680c25fd2145a0c9de1f2b571433"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4d5e5082b19e5afbec71bd03ff0e8bf4c44c9ee7678e42de9309c49cb51f88f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6bede5df69b7825cc90d0fd27ba8de0486b43ac786d5da43cfa29d0b35b53887"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f0d653b84222b3f16f85a0113ed48d6e3bec7d66519a493452515389c616dba9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bf3e8fa1ad57c0f7951aab9f625f99e71b83bc4bf05adf2c5704b34a7ece01c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2ad3f43b2dcad53a568af8937d946a713e1cd7ab7d30981771f9c4559477408b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5b36ae62e6bf5030ff2fe392d876b54cb03de6ff202f306b22ee0f1bf5c052f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, 
"a8d195f66957c7c2aa686dc01a6b435c1614ad4166644ea416afaa00dcdda541"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9bf621bb25e5ab8f0019ae0c846301a4e5ce3129e97320204f0ee5dcbde8d736"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "125d24d778d9ecd547d8dd0a73dfb3f05c088aeeb31a1202cb9390a1e8bce033"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a36158f3237850a6104f16ea239f12c52c6f2d60ed12bd7b3669c26c219b3fe0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "79f55f6b44e2ec220cb387bddc648d4af3638dbd3b1d7b33cf13f4769c8e4b38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "795d59adb81386f0045f871c021b430841e2f69c02438652e784811336aa8ee6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e57062e66ff3139600af2023419345b5941aacb2a071da4f705e9e4365c3f867"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a4f7ce2282e3c331c7c0e7d5a9663d643406c9fea67598201a2c4012682e49b6"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eea0ee5bbd572e0d4d9704a12987c46ebe1b0d55f0cd002d3a6356b7c7194768"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1f19222f99c5a8495c19e27f0d34099ed49dcf71420d224b214195ecc62bc9e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3ac47eeec918d46a1459daf42988e172e25b401ee381b77a71efbb577a8feb8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 
512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7526746531d42122bba7e5381892d48e9710e40cb52f0ae5fa9281ce972eaf60"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1e11f7fb2ea0a9087bc584298bfeaba5613ec4c8239cb4aca9d3b2006455263a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7146d11263067316abc32f60026707001e97b6a620ae8afa3a2b85839a4f8832"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4fa24ecf819e4cb01af69d2314681712c50964a82d87d3bb4c2bdd17dfc34db4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1353ddcc42f6cc66f6ee08fb22c45627b861ed3e4084933f843a715174552482"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e4cb0df157e96212b565a79eb487f7935dd2012934b9afb446b063b4fb85f18b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "dbf527a74a2b0766b57535042ca6d6e0b5591ffd06e01921a67b51c5173dbfe9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2f3d6104b59b917948f6b3b5bbccad960e93c12a4a83631b9b07f770684feffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c7c45ea305e37e79be18e17904f16068831a77662126d72417f1e80773e8ee38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7898f08be043d3f006719e68d71f09f79bf473ae34aab19b3ab262a1a4924052"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e40d31e2e6bc567267446b097ebcea9d413377af76000d66652e4d632365c314"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, 
"de1da4839c51a07c0b75aa530385b88642ac062916ccc80f5211e87be1ef6deb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8b75f0cd7eec17721b235585540445ada046537f6ccd6a83a185ed415848a149"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c0fa2d6c625517745a0607ca6aa7c84a42e6d316115b673a4fa3bd003305fa50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e6ca692e7a0781ae0586be9e9f1d91f62e4cb14cf33758df690a986263b05b78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "aa264d6bc47cb61cab2f18705bbd938b9f7fb900172521e5fe5c246c409368bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "85316ed101eb192471a1052c0da54c0d638c288391dd7a25dbc02296be7a296c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "2d2ec1ba2bec477e82dd86499babf318d7d940a9206a129c9bb756e7a3f359d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "af2e4f5f603c7136a8b15ec33f95c5d442358f8741e0c0ff72ef38626d97bf90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4c239e64009d24898a9c79cb139efa31f48d0b58fb1f6aef4aa7fd22900f7cd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "75d35f3b6edd9093e458734c307f849062eb65e6902b3e323507490e59c524d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a03678184645eec1aba25cfcff50cfeb2aabb4a92538259f3660577bff0cfbc5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, 
"d0e7d2346fed92a3384390cdee8ef0d40c8bfbc04cbdbf8498db4c812e6d6001"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "360cf65b8814569a997f0b42e464a26f83aaf6edc6995afaf988bc0756ca465b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f297ce8a5eed5adbbe03c30e6d4fa5617eb72de72527f5dd59ff6ef326498a5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e0daf676c0e64e2b4a7d1924764f28a774aaa8a5346a3c5def913bc21b5e630f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c98f6cbed970cf184c9dc2280653ae8d9a3a858755c846fc900918a64577e2a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7c27b56d1d5bfc83e7807c3aad9132f2a8b2c80c663fb326c18f59420dcdf444"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "57b2872d1e919a6226c26a8ed425acd881700122e9f70dcdc7321c5085ded141"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "16f6b52df67f622ebf94c8c8f6ef7f84f9a9bdfbf6ddfd1e0569e14caab40367"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "16fed94e9aa03d508e8179c8ed5c449edf1896202c1e024348851b8f46ef01b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "afcefb117b45ea7fb0980b0c6ad1fe5ac04f61332b9ce5c471e6f1af1e8b43e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "55eb006cefd5749074cfc1558187b64940f1a635eb6564521803d8e7b28b4857"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 
1, 0, false, false, false, false, false, "8fd1fdca9c14c7e8ed9bc58c74ac97fd20aa8ff9036c5be7e915846c058f96df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "87eac59ac6653b8854b609b0b5bdd7e48c100ecf9a7b2129e0c0d495672fd57d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "8d92a9fa28640997109579f6f4a0094f7964b79ef1bcbf937c8939867c13e527"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "489b736a717f881eeeb4e18a788c87c4554cfb67f03d6e5f6c4edabd28eea4a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f46d1d0bbfa3e99ffe3621182c2542ac3fc5d252309c6bf94be044af7f98f7e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "49b37bf6b7b3ec563ca55f915efb3bfa4164190d02d26659b475aebb35c7e063"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fbe7a013fb10e9d3198f607569a3a36e0d75db8b80a1246652d8b0af20e75169"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c415175a6586c0d8c24e81b0e02687009ed725b0020763ac0d4d884ae578e4b7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 
256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5983405fd9dc4a016f9d4b5fe0e19b8e249033e1b88047695d52607c2c16a0f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "612fb001a47dd7f5edad06676a521d2909b49d55cce4db01bbd5ecccb3576f57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a4569b9389199160aed3930a80f77dbb667dd762eb2de60a779b9e2138fa3d71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, 
"dacb1f9c1b83875750bd5792c146dc6e5eac6dfa6acb67352902052d68e21611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "70fd20eff5be6bdd3326cb8c2627cdbf414ba87a77b437e1782b45ab6ac4cb35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e491cd65ea0b67d1ece6df623dfb58a766bc6b4ea5100e41dfe49cc8bd3fcdb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d078460c57819b4d0d1c3f5fe6ce73eafa2033f8d7657fb70b1168a533b2b25f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5cdf5c0a903da2d995101012b3fd90c88ace027478357c156176148d93745450"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "eb5d3e145e834d93699189d88a51243c9f08e43c1385bf010b06c239b3dd46c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6d547b157dbad1ee259c650d721e30a8873f000cd6e3dd0256f4a25c4d348bc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, 
"a10445f07089ba8c8ad92ea0a70df62a526acc7fac60b735787c001e309f9ff3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "090abf148986123b7f0b15d77374092ef7eb090efbdf48bf44822bd846142c08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d5d7c82855b9d6aeac6707ec6eeed005e90f4a87a74b3909aa59e07a63b1fc4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5f0a48e82eb69fd2193b245e5628a9724e39df758981eebfaa2138551cabe9f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ebf30a181d8315b098205ea7aeb3eb92bae50f7f37700092e40af7bdc3210574"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2041c1bcff893c91e2895180071eda92459aabe6fe634828aca42b782689f75b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5895afc9817af940c56abcf036ec8ff7b9ae03c1f658ed50283024c20568b105"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0edbc421ab05fb39e1b71fc35383b3d550a62c95a4abd293053a7ac9d8745eec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cdccf28ba3cc22d872d050dda3f102e6320aee3878b46348ddf987609d6bfbe7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "be51ea950d9edaee840f0cb1218751ed8470e77f72bb8161a841e8b8b2edcd3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a32c4075650da102f824e322dc713ab303fac0c3363c0374af2748e757935e25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7f4b1a37df4b8dd077a4c522bd6e5f295079a912dbc6cef017c195d7938ddd10"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, 
FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "62fc5c05d9887edc1344f83682f3c6649e7a6a4bd8f94358d28092146fb161f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b6644272548ba651268e6f84ba464585ad313e97d87eb7289c430a17748ba29a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "81206b5f912d7519013979abff703bdfd92db0e72b0ef1e93eda8b60c707426e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "986212c4ca276774db46a551545a6234fbccabb5a5b35bee13f633e22eb45855"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "5ae09c82b34f3db257bdd94e5275314a2c4fedea39bfeb4f73acae00d0cd4ddb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9353795de2e8ebcfed23bd106079ef6d6b0dd5ff96b395056cf30b5088bf61e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fc12ef83fad73825f52ad44468e4e4feac385311fc5054b122b11f6f74044c83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7a652556d591bbefc06b43ec1b5e9eae76f5d0aa6f97ce57b761ba655b96dfd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ce58a1fe839457a8094a2a7def1761fb3619e171a79560dde428359d9f505566"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "0ecf55e3725696d8f4bc52431b5fe5a63ea3eeaf1907839d411ed04ce8419ebf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, 
false, false, false, false, "9fc91cf1354889ec2b67f1553986322ee2bae0024dd86103da264fbda9d9c7e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5269ddaceb4bad03b6f3162c92ddd5b314ce7c936cce4bd71dd5f2dbc408ac92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c29682e11e319e68d1b5a5db1a6438bb583b0b61525fdc8833de09387217959e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "60a7a01fcb7b3cee19c304c31427d2fcb50812132ba01df8e6721d2a22bebfb1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1a4e099ceb3379cb53e2783a9f5ef1d18b7cb716bea715db43a45f665473611b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1c0f8fba78ecdcfc7ccbfa223025c77df626f0e349e7301b7f3888a36d204168"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e3878148d8a450894a0fe3935a333f84b2364eb7101ebaf970ff2d90823145f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "00cb11734a5e21e45cc98769fc0079a4868abb84a43c60fe686b654a9ddfb98a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "91aa2e3cdcff6ec2703e411344b62769648b8612db50bc9f2628366c8bae8de6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "39e713b9f51404b56629ba2c03941daf35c9d0266721dc0b5cb08c50d0f7aa17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6c5d39b1589758e2c6eb187ccc528418766ea9d4a06aed309a701fa658c676c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ca428b467f16e18f3268fb9f693ec1e2925cf1119302ad154b209243c0afd679"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d41f7470f967edc52d17067e26fec3569284d9883f5e3bac612367956ac9851a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "abf36a35ca9d3ec7795edca478c803d616ad6f0d1b067421d57b26f63cb72c0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "5b8e951a959928def41a140f3518566b818f827dda71a93dedee738e0db8b0fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "15446a717f8614aa8d0ee6d1fae4935bdfafd3d6495469cf49e13f8abb8e06fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "928b2d2b3f674ea10c909f72d411b8410bac6281b031056b5d890e226e21570b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "16c6f70dabc5ba230f5f0daf37c3cdd5ec79b7ae7bc8bcee0fdf1031786639a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, 
false, false, false, false, false, "0bf88739565b7f08a38c781a2e49f421a882a9afa81ea1a317336156a1786d42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3ecf158d1ce0468f1b4d018b1f03d4bfa67f52a770c735ded265fe2eaba08ac0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "58343706cda0ade14d3b1a1eb276bcd9ffa294e6f025f9bfa671256adb70fb67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d1acb626847b4bf4822563f4d529c90038adc29240214754a9d614bbda876f6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d39c41027acdbf1113dda57ee66199c39569ffc3af673a0b3f4797e4c6c35811"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "97685d2389b58535baf05dbaa92adb8b0704a1e0db78af281f60ff7d74ebd168"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "66e0a0231587b0f897edfd6bc89cc0ef5263c37f1ec4a77b847852dd1fccccc3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c786f32cde2501b718b3e5aededf4d1adc7d13238d8ad40fbf7487512457b5de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "169586a393b78d5ac46fcddffbeea6adec94914b9cfe5717eb49ae98994b629c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "95cd0219cd0ddb6209349f18ca19549e059ec399c879900bc2924674ae057c25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ae72e413f4496306794c4bd790e8540ca67ec3a570100b6a2d13edee76087bdb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "cc32129b827d55206992311fba216ed7f912567ba55bceeb4e7cee21a97a8f9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "da81af4a26292003aef93cb08da414f3398730fd4f137d7ac09edd5e9062cda6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6b80a93f7cc683151eecf6e769f9d7cc195886dad21a54c2a70c630692292287"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "0576977c5a4e52f255af1791fbe613ef7743728ceef8a035b18b2bd349a25220"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ab12f7632e3a639ba2309728c1e922a51e6ad7665adf1f0f38b7846a498ff4c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "55ee7d9b5621a93316f5a1cafee41256070e2c8948ab655d62f15cdbec90005f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9f0250f9e68d3f22cfb9106c0d4e2e814341146edc55e1157397ad19312a79de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "05925425256a3a5e0a4f337d3dd43fc4b405377318cffd59be80e4828fcee543"}, -{ DATA_TYPE_E4M3, 
DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7437a6adc4c7de8518028d83839c4fbf26e4a626e78d625968a8f213c7ad5b31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "28270803d084ea77d15d51f711fb53c38a2367d449a97969e9550983fc1a2b63"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "69fd313dd39f6aa2638200cec1163d48639ecde21f834d0b0033120dafa8ebbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, 
"1443787fb038a4c6ec9032ef9d260bdd1e74cf8abc2654512aed50cba1b62895"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1186bf72e6657767c768a14db6b915f199fd82e907604e8f8afb1a67c3eaa5f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "08cc6797d6827cff8eeb485ec13c6361166df31891d43816ee5748afca068eb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "55fca17259324e689967ea7d82cb6e35e6c0b06b9133e927e4f2007893b74b36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "be5612dbeea96d69c09e31b5112558121b00064ba85041a41adab0d177e55816"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "744f29883c731833ce00790bb00ed6a721464c3010fe2190b579cd6698cece52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6ffce0f9972beb968ece8d23874d220f88929316fe81a1bbd5d37d50e42741bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "aa71cf66265ae9ce0ac7fa1d07be3733f480b48a92fa0a2dba96b72eba227e2b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 
128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1f02d316ec3df836c8a262aa5db765cbeedfe4f9616385aa8aee7ffad04fd36e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "560cdd6daaa3a348a20f44dd687c519fb0b4b51a162e855539a2dbd30b317f00"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e2d94183b0e30accc6d6c6bf437b51d612939831134e810b356d793c092bae34"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, 
"c116e0b14b2b321cb0a93e7519e80fddc5af3df400f9bdd1920913784821c5c5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bdf5231cbe222e661a67812cdf2829e4f1f25b947714c24bd84559316d96a719"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b06fd3381a60dcbe984a1c1427848858a202c635a89831e0ef628c97af697355"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c8787d5a7df04f52c988c64c7cff9cea3d84be89ba65b13f14bca851202f8d05"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "47184ab746be0b88a2c2a6a418283fce68490f3c47a3c29eb3ca3b0b3dcf0a51"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dc8a670d26ced424fb47058f9e8d9f51dcd4383e86bb5e665f3f56d01f5d9bd0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "963ac970c070c834a078edb72f1f0c1de830917e72b0149ff733e9720f2a69cf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1c65c5a14c9def975371a5476c49c42db4a2d5cd23c2db638b9d7286e0f44dd3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 
128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "95c5c9d1bb6023ad115ed8ae8b75ab317c9142a5aabc7c0ec2b39d6950d9887b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ea44f57a56fac0fbaa388f9da2f0e2b071ad6b9b5fca99be4ef2e44cef892aa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "49d7fe4376a9e7411f9f45df2307b418d1766f99e95214347a721eefd916d244"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, 
false, "5679e5d51d637103396228e8719a761361ac15101edba1508550a9e50bb88743"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a0e1b51c0efb8f0864676cc68039c186c13b1f69de9a8d7b816b13a20155158c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "50f346d23f930c1277cf24c293492c10756208061c003ad86622157593a9d1e8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4b1522f0913d209e03a72068f66477a9791d55274f656aeea90ab1208ec73156"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "01ff9dbd2734d11b5f273ee4cfaa04c8cf9bbee796bd2a902877ddceb31439ff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1211da1a72937b17402db25ee79036e39fa00da71ddc8e34ea923857c4ccfc73"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a2120aae610b64ec82e0d4845cdcda650834eb9911e12a77f130766148dd71ff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b7a661a78716067d060185740c7086e6d354244327244981efc34ba619522d84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c6cdd7365d565a5de9d40e8d379d90d499eddc1c6c8110a884e07613acf70054"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ec2887d02d985dd0c782fa6bb6acf8b668c01172b62ed9528a0bb4e549d93a8b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a3d4d6526c3693e71d4ce051f181ac3441ee42409c4160a4300dbd7676ec2e81"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 
0, false, false, false, false, false, "77442c9fa68fa3d6375d35adf57c5621eef905aecaece965ca9920eedc8f9763"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e17ba02f62bca92562af42ac5fb6ce045be925ba460998a25f5bb24b0036465d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5b1e7caaa771ef24ce28f820a43182d44a46e193d54bf090c3eb0d59d0d949aa"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "47fb47089120fa77dd32ffc1f0a8d5d35082392445759a9f0d21aaebda8b5bcb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ab95ed920017c83e85b71a2f3e6cee0c5746210be641228b1e900f8a183bec13"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e68b0aeb811e59d93e3adb75c59685e5cd1f69ec516d822c13dd2996843abd65"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "228c5c5e519329aebf09cef72062ba10dfa2933aaf25c25005b7751296e8165b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "25cb23c05ecdb47b854d94cf84e5ba1c8408c24a4044101f325e22f0ba7e7040"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, 
FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b05bf9145cad8cbc02ea3fd651789fc441b74c066e7fecd1906cda93e5800843"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a1f05a7c84112c2621e858b3200938ea2d3323cec860bbbc02ebcfdccdd84f6e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3be1542f9365be539de5fc7f8109f332c4bb1c1b10fa305960278b6ab2014df3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "60dd62bba4775f562b14f0998cfd105ba6317f72c13b2f309cbdd9fcb442ba03"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a97c1779cc4ae0bbcbecaf3df4b24ec52c176e5366b801669047b395206f5ee8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eaede5d634bafc9c29d331bbe9ac16ec33763dae345ebc8eb4a42c12b33d123b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "aa099c828f44bf0f8cfdf8f4dbcbf8b521110ab8db36e61a066ca7010cb2bfb7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, 
FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a58ac0e0b913c441fe3cb542bb367af85b94a6b9453b24ee50e8abb2093aed38"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c53ab19beb06d26473d492c04641315b32565f1159814f0a800bfe91b9773a4b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "83c945fe1276e8ad5e7c7883c00c76cf2b2343cbe86c17b15f99590b94037935"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "c7eb2080349acc3860fb9d7702140c19333234c3987f8870bdbe1e53ae703342"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, 
DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "27e7f095fea12aac97a82e7a74502eb788e7567531507c05f9887da060697d10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "91f80741f4dcb2aedc6bc27ab7145818b998e81bffd12f8235043ea8159bf3f9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "51f5fde6fdc919c0a38aae1f537bbe3ccef479c8d083fca265dccbb911b2d7c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, 
"99204916b4ed12a8884b98570e06c9dba63d8d19bb15024d6ec7a74e421a04e8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "041147f93c30dfe4146b1acbaa232c115d3a47639370d5610ec1d9ae74ae2c09"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1848b3a3bc8894b7145eae3caf9903226030ed8898f1d71a3d4c771a8a5fc4e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e454e547b899dedc442df595198f801d84b98f47cb1a3ff848cce24472429a10"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, 
"fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e18d8f859b1b5f368bd09ed50c1e44bc4228b83e740d399e407b9fff88b6c0f0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "e582677e6a79c0e57c3fbcd41321e7082ef5cc33f63e237719817c0d91624e5f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "019056de5fe0a1a9e8456e2b3e4c3807478216b232905a1a22c320c100b18be8"}, -#endif // EXCLUDE_SM_100 -}; +extern TllmGenFmhaKernelMetaInfo const sTllmGenFmhaKernelMetaInfos[]; +extern size_t const sTllmGenFmhaKernelMetaInfosSize; // clang-format on } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp new file mode 100644 index 0000000000..d6e0a71040 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo_cubin.cpp @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:58892823173fd43ae549acccc4821c4eddc1605cce202489b0d1f425ebe279e3 +size 1573155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 624c7833b3..4eb5ac5266 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -17,6 +17,7 @@ #pragma once #include "cuda_runtime_api.h" +#include "tensorrt_llm/common/config.h" #include #include #include @@ -34,8 +35,8 @@ namespace tc = tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -702,12 +703,13 @@ inline TllmGenFmhaKernel const* getTllmFmhaKernels( { #if !defined(EXCLUDE_SM_100) || !defined(EXCLUDE_SM_103) - return TllmFmhaKernelFactory::Get().getKernels(sTllmGenFmhaKernelMetaInfos, - sizeof(sTllmGenFmhaKernelMetaInfos) / sizeof(sTllmGenFmhaKernelMetaInfos[0]), dtypeQ, dtypeKv, dtypeOut, sm); + return TllmFmhaKernelFactory::Get().getKernels( + sTllmGenFmhaKernelMetaInfos, sTllmGenFmhaKernelMetaInfosSize, dtypeQ, dtypeKv, dtypeOut, sm); #else return nullptr; #endif // EXCLUDE_SM_100 } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu index 49f1cdbe88..1a0cca54da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.cu @@ -16,12 +16,13 @@ #include "fmhaReduction.h" #include "kernelUtils.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/envUtils.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -393,4 +394,5 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams 
//////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h index dd771f123e..c717e333c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaReduction.h @@ -19,9 +19,10 @@ #include "cubin/kernelMetaInfo.h" #include "fmhaRunnerParams.h" #include "kernelParams.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -33,4 +34,5 @@ void runFmhaReduction(TllmGenFmhaKernelMetaInfo const& kernelMeta, KernelParams //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp index eca8d18d15..da476d1126 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.cpp @@ -15,14 +15,15 @@ */ #include "fmhaRunner.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" //////////////////////////////////////////////////////////////////////////////////////////////////// -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -76,4 +77,5 @@ size_t TllmGenFmhaRunner::getTotalDeviceMemory() const } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h 
b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h index 4d2c6f9cb6..b42a61a818 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunner.h @@ -20,10 +20,11 @@ #include "fmhaKernels.h" #include "fmhaRunnerParams.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -59,4 +60,5 @@ private: }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h index 90907f1352..b43f70b713 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h @@ -17,11 +17,12 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -362,4 +363,5 @@ struct TllmGenSelectKernelParams }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h index 7961213f2b..fe33ac5890 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h @@ -26,13 +26,14 @@ #include #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/multiHeadAttentionCommon.h" #include "fmhaRunnerParams.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -854,4 +855,5 @@ struct KernelParams 
//////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h index 2d08684105..5f4e2f6b71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelUtils.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -170,4 +171,5 @@ inline __device__ void convertToFloatAndAccumulate<__nv_bfloat16, 8>( //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu index bcae09dd36..af267c5901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.cu @@ -15,12 +15,13 @@ */ #include "prepareCustomMask.h" +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -292,4 +293,5 @@ void runPrepareCustomMask( //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h index 178c104f65..86160a0aea 100644 --- 
a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/prepareCustomMask.h @@ -17,9 +17,10 @@ #pragma once #include "cubin/kernelMetaInfo.h" #include "fmhaRunnerParams.h" +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN -namespace tensorrt_llm -{ namespace kernels { @@ -31,4 +32,5 @@ void runPrepareCustomMask( //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp index 726a2aea7e..cdac59877d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp @@ -24,11 +24,12 @@ #include "KernelRunner.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -231,4 +232,5 @@ void TrtllmGenGemmRunner::selectGemmConfig(int32_t m, int32_t n, int32_t k) } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h index 6bddd8cf3d..904cc8ed84 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -57,4 +58,5 @@ private: std::vector mPassingConfigIndices; }; } // namespace kernels -} // namespace 
tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp index 25eb9cd915..b1bc466b47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.cpp @@ -18,12 +18,13 @@ #include "KernelRunner.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "trtllmGen_gatedAct_export/GemmGatedActInterface.h" #include "trtllmGen_gatedAct_export/GemmOptions.h" #include "trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { using namespace gemmGatedAct::gemmGatedAct; @@ -144,4 +145,5 @@ void TrtllmGenGemmGatedActRunner::selectGemmConfig(int32_t m, int32_t n, int32_t } } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h index cbd6bada46..7bbb5d9ad3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemmGatedAct/KernelRunner.h @@ -16,13 +16,14 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include "trtllmGen_gatedAct_export/trtllm/gen/DtypeDecl.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -56,4 +57,5 @@ private: std::vector mPassingConfigIndices; }; } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu index f6107d3397..1db236fc47 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu @@ -16,6 +16,7 @@ */ #include 
"tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/reduceKernelUtils.cuh" @@ -27,8 +28,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -2428,4 +2429,5 @@ INSTANTIATE_invokeCpTransposeToSeqMajor2(__nv_fp8_e4m3); #undef INSTANTIATE_invokeCpTransposeToSeqMajor2 } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h index 57fd40b78c..1a8a7a7139 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/gptKernels.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" #include "tensorrt_llm/kernels/mlaKernels.h" @@ -25,8 +26,8 @@ #include #endif -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -462,4 +463,4 @@ void invokeCpTransposeToSeqMajor2(T* dst, T const* src, int32_t const* q_seq_len } // namespace kernels -} // namespace tensorrt_llm +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu index 2dd6b9206b..5d006ef4a9 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_bf16.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(__nv_bfloat16, __nv_bfloat16, KVLi #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu index 7588cb6e13..2236e205a3 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp4.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -30,4 +31,5 @@ INSTANTIATE_ATTENTION_INPUT_PROCESSING(__nv_bfloat16, __nv_fp4_e2m1, KVLinearBuf #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu index a11c03d72f..9ae656040c 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_fp8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(__nv_bfloat16, __nv_fp8_e4m3, KVLi #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu index b0aae2b69b..eeb063db5d 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_bf16_int8.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(__nv_bfloat16, int8_t, KVLinearBuf #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu index 5ae9090c92..55e3e8756a 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_float.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, float, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, float, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu index 48db782612..ba27fff075 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_fp8.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, __nv_fp8_e4m3, KVBlockArray INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, __nv_fp8_e4m3, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu index 495db6c89a..ba25c39448 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_float_int8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, int8_t, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(float, int8_t, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu index a29bc7e451..ff3d2e87d9 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp4.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_ATTENTION_INPUT_PROCESSING(half, __nv_fp4_e2m1, KVLinearBuffer); #endif } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu index c0a1f384ed..55f51543c0 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_fp8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, __nv_fp8_e4m3, KVBlockArray) INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, __nv_fp8_e4m3, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu index 5d886bd817..5abd544359 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_half.cu @@ -15,10 +15,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, half, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, half, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu index ac9da4fa99..65f51b2f14 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_half_int8.cu @@ -15,10 +15,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "unfusedAttentionKernels_2_template.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -26,4 +27,5 @@ INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, int8_t, KVBlockArray); INSTANTIATE_ATTENTION_INPUT_OUTPUT_PROCESSING(half, int8_t, KVLinearBuffer); } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index 32facc70c5..053bf5114f 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -18,6 +18,7 @@ // Separate from unfusedAttentionKernel to accelerate compiling. #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaTypeUtils.cuh" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" @@ -30,8 +31,8 @@ using namespace tensorrt_llm::common; -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -1865,4 +1866,5 @@ void invokeUpdateSparseKvCacheAfterFmha(QKVPreprocessingParams //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp index b588838c92..945e68a7ea 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/ipcsocket.cpp @@ -15,11 +15,13 @@ */ #include "ipcsocket.h" #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/logger.h" #include #include #include #include + #if 
ENABLE_MULTI_DEVICE namespace tensorrt_llm::runtime::ub { @@ -300,4 +302,5 @@ ipcSocketResult_t ipcSocketSendFd(IpcSocketHandle* handle, int const sendFd, int return ipcSocketSendMsg(handle, NULL, 0, sendFd, rank, hash); } } // namespace tensorrt_llm::runtime::ub + #endif diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp index 2e3e6dde66..7fde40dbc7 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.cpp @@ -26,7 +26,7 @@ UserBufferAllocator& UserBufferAllocator::Instance() return _; } -void UserBufferAllocator::initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig) +void UserBufferAllocator::initialize(::tensorrt_llm::runtime::WorldConfig const& worldConfig) { if (!isInitialized()) { diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h index 05a4b6dd4e..d9e3494a44 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_allocator.h @@ -20,6 +20,7 @@ #include "nccl.h" #include "userbuffers.h" #else + using ncclWindow_t = void*; #endif @@ -56,7 +57,7 @@ public: UserBufferAllocator() = default; - virtual void initialize(tensorrt_llm::runtime::WorldConfig const& worldConfig); + virtual void initialize(::tensorrt_llm::runtime::WorldConfig const& worldConfig); bool isInitialized(); UBBuffer allocate(size_t bytes); void deallocate(void* addr); @@ -70,7 +71,7 @@ private: protected: std::vector mBuffers; bool mIsInitialized; - tensorrt_llm::runtime::WorldConfig mWorldConfig; + ::tensorrt_llm::runtime::WorldConfig mWorldConfig; }; #else diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp index 6d5f62b260..3e19f9ebe7 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp +++ 
b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "ub_interface.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaDriverWrapper.h" #include #include @@ -21,7 +22,7 @@ #if ENABLE_MULTI_DEVICE namespace tensorrt_llm::runtime::ub { -void ub_initialize(tensorrt_llm::runtime::WorldConfig const& world_config) +void ub_initialize(::tensorrt_llm::runtime::WorldConfig const& world_config) { UserBufferAllocator::Instance().initialize(world_config); } @@ -30,7 +31,7 @@ void ub_initialize(int tp_size) { int num_devices; TLLM_CUDA_CHECK(cudaGetDeviceCount(&num_devices)); - tensorrt_llm::runtime::WorldConfig world_config(tp_size, 1, 1, COMM_SESSION.getRank(), num_devices); + ::tensorrt_llm::runtime::WorldConfig world_config(tp_size, 1, 1, COMM_SESSION.getRank(), num_devices); UserBufferAllocator::Instance().initialize(world_config); } @@ -71,10 +72,13 @@ bool ub_supported() } }; // namespace tensorrt_llm::runtime::ub -namespace tensorrt_llm::kernels::ub -{ using namespace tensorrt_llm::runtime::ub; +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub +{ + void allreduce2_userbuff_inplace_launcher(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream) { @@ -115,11 +119,14 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_launcher(int const handler, si scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, dataType, comm, stream); } -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END + #else namespace tensorrt_llm::runtime::ub { -void ub_initialize(tensorrt_llm::runtime::WorldConfig const& world_config) {} +void ub_initialize(::tensorrt_llm::runtime::WorldConfig const& world_config) {} void ub_initialize(int tp_size) {} @@ -151,10 +158,12 @@ bool ub_supported() } }; // namespace tensorrt_llm::runtime::ub -namespace 
tensorrt_llm::kernels::ub -{ using namespace tensorrt_llm::runtime::ub; +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub +{ void allreduce2_userbuff_inplace_launcher(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream) { @@ -182,5 +191,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_launcher(int const handler, si { return 0; } -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END #endif diff --git a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h index a33dd0ac58..e8a48e2c68 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/ub_interface.h @@ -15,13 +15,14 @@ */ #pragma once #include "cuda_runtime.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/dataType.h" #include "ub_allocator.h" namespace tensorrt_llm::runtime::ub { -void ub_initialize(tensorrt_llm::runtime::WorldConfig const& world_config); +void ub_initialize(::tensorrt_llm::runtime::WorldConfig const& world_config); void ub_initialize(int tp_size); bool ub_is_initialized(); UBBuffer ub_allocate(size_t bytes); @@ -31,9 +32,13 @@ communicator* ub_comm(); bool ub_supported(); }; // namespace tensorrt_llm::runtime::ub -namespace tensorrt_llm::kernels::ub +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub { -using namespace tensorrt_llm::runtime::ub; + +using ::tensorrt_llm::runtime::ub::communicator; + void allreduce2_userbuff_inplace_launcher(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream = 0); @@ -53,4 +58,6 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_launcher(int const handler, si int const out_handler, size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements, int const 
hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, void* residual_out, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp index daba59b35a..be4d5e0c2e 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers-host.cpp @@ -123,7 +123,7 @@ void ub_free(void* ptr) } } // namespace -int create_communicator_grouped2(communicator** comm, tensorrt_llm::runtime::WorldConfig const& world_config) +int create_communicator_grouped2(communicator** comm, ::tensorrt_llm::runtime::WorldConfig const& world_config) { *comm = (communicator*) malloc(sizeof(communicator)); diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu index 52956d9f9e..8cb5814e03 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu @@ -14,13 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/quantization.cuh" #include "userbuffers.h" #include "utils.h" -namespace tensorrt_llm::kernels::ub +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub { -using namespace tensorrt_llm::runtime::ub; #define MAX_THREADS 1024 #define TIMEOUT 200000000000ull @@ -1953,4 +1955,6 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl"); } } -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h index 9751f969d5..96f21b7482 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.h @@ -97,7 +97,7 @@ struct communicator }; using communicator = struct communicator; -int create_communicator_grouped2(communicator** comm, tensorrt_llm::runtime::WorldConfig const& world_config); +int create_communicator_grouped2(communicator** comm, ::tensorrt_llm::runtime::WorldConfig const& world_config); /* creates communicator with allreduce1 to happen in datagpus x datanodes groups, allreduce2 to happen in tensorgpus x tensor nodes, @@ -114,9 +114,11 @@ int register_user_buffer_collective(void** gpubuff, size_t bytes, communicator* void destroy_communicator(communicator* comm); } // namespace tensorrt_llm::runtime::ub -namespace tensorrt_llm::kernels::ub +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::ub { -using namespace tensorrt_llm::runtime::ub; +using namespace ::tensorrt_llm::runtime::ub; void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, size_t const elements, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream = 0); // for TP-parallelism, only single node is implemented @@ -137,4 +139,6 @@ int 
allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, void* residual_out, nvinfer1::DataType dataType, communicator* comm, cudaStream_t stream); -} // namespace tensorrt_llm::kernels::ub +} // namespace kernels::ub + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h index c8228f7d1c..c8c5f10f8a 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include @@ -24,8 +25,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -102,4 +103,5 @@ struct Params }; } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h index 463f3f7fe2..0bb32bdca6 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h @@ -16,12 +16,13 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -79,4 +80,5 @@ struct I2FConverter } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu index 94488579ec..c60d8f9d88 100644 --- 
a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.cu @@ -15,11 +15,12 @@ */ #include "cutlass/numeric_conversion.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm @@ -330,4 +331,5 @@ bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream) } // namespace cuda_core_gemm } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h index dd4a72d1b8..eb939b57c2 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemm.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" @@ -35,8 +36,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm @@ -95,4 +96,5 @@ struct Params bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream); } // namespace cuda_core_gemm } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu index 5752c79332..1d208a293b 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.cu @@ -15,12 +15,13 @@ */ #include "cutlass/numeric_conversion.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include 
"tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h" #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm_nvfp4 @@ -290,4 +291,5 @@ bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream) } // namespace cuda_core_gemm_nvfp4 } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h index 2e37196d0d..d47d37c06a 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/cudaCoreGemmNVFP4.h @@ -16,6 +16,7 @@ #pragma once #include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" #include "tensorrt_llm/common/logger.h" @@ -35,8 +36,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace cuda_core_gemm_nvfp4 @@ -78,4 +79,5 @@ struct Params bool cudaCoreGemmDispatcher(Params const& params, cudaStream_t stream); } // namespace cuda_core_gemm_nvfp4 } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h index 19dd66fa87..766d379112 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h @@ -15,10 +15,11 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -145,4 +146,5 @@ struct KernelDetails } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu index 8804da4e52..96aa3e0d91 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace smooth_quant @@ -172,4 +173,5 @@ template void int8_sq_launcher<__nv_bfloat16>(Params& params, cudaStream_t s); #endif } // namespace smooth_quant } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h index fa247e279a..d33e6a331d 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/int8SQ.h @@ -15,6 +15,7 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/quantization.h" #include #include @@ -25,8 +26,8 @@ #include #include -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace smooth_quant @@ -60,4 +61,5 @@ template void int8_sq_launcher(Params& params, cudaStream_t s); } // namespace smooth_quant } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h index de4a960e14..be95976465 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h @@ -16,11 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h" -namespace 
tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -159,4 +160,5 @@ void exec_kernel(Params& params, cudaStream_t s) } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h index 8a44f8aeaf..05bdcfab6c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h @@ -16,11 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernel.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -171,4 +172,5 @@ void select_gs(Params& params, cudaStream_t s) KernelDetails>(Params & params, cudaStream_t s); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu index 75fe733145..1c1324d33f 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu index 02892bcf72..26d856258c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu index 42d984c49f..6af1e8dc96 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { @@ -28,4 +29,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu index e1080ee620..9fd295a594 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int4PerChannel, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu index 41f69e246c..9c97b82d57 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajoInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu index 6c718b24a9..adf02fcd45 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu index 44d6ebbaf3..31f7e4115c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8PerChannel, BF16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu index 7cee8ee139..29725cfe9c 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::BF16Int8PerChannel, BF16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu index 555f2db582..1662999bc4 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu index e392da50da..371bcd73a3 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -29,4 +30,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 128); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu index 6a77b98cf3..6bbec17ccc 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu index 08034547da..51ff227805 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu index 8a3d0ee94a..eb0d3fb7ce 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu index fa5002ae05..33225d078b 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu index f8eeb0dfd9..0b66b130bd 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedForHopperTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajorInterleavedForHopper, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu index 626e99bc50..d6932b9348 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorInterleavedTrue.cu @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -26,4 +27,5 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true, 64); } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index 0ca925d3a5..4562562754 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -15,12 +15,13 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/common.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -112,4 +113,5 @@ inline bool is_supported(int arch, KernelType kernel_type) } } // namespace weight_only } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h index 4e660f0d60..2d5d2a2ee7 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/utility.h @@ -16,11 +16,12 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/converter.h" #include "tensorrt_llm/kernels/weightOnlyBatchedGemv/details.h" -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { namespace weight_only @@ -330,4 +331,5 @@ private: }; } // namespace weight_only } // namespace kernels -} // 
namespace tensorrt_llm + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp index 9bc7513aea..458b6983d8 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp @@ -15,6 +15,7 @@ */ #include "xqaDispatcher.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplCommon.h" #include "tensorrt_llm/kernels/sparseAttentionKernels.h" @@ -38,7 +39,9 @@ constexpr inline T roundUp(T a, T b) } // namespace -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { namespace @@ -538,4 +541,6 @@ void XqaDispatcher::run( //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.h b/cpp/tensorrt_llm/kernels/xqaDispatcher.h index 784b30eda8..8888beddb8 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.h +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h" #include "tensorrt_llm/kernels/kvCacheUtils.h" @@ -25,7 +26,9 @@ using namespace tensorrt_llm::common; using tensorrt_llm::common::op::UniqPtrWNullCopy; -namespace tensorrt_llm::kernels +TRTLLM_NAMESPACE_BEGIN + +namespace kernels { //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -114,4 +117,6 @@ constexpr uint32_t xqaMlaCgaXBufSize = 8704 * 2; //////////////////////////////////////////////////////////////////////////////////////////////////// -} // namespace tensorrt_llm::kernels +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git 
a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp index 8688f8e79c..b6e42df465 100644 --- a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp @@ -16,6 +16,7 @@ */ #include "bindings.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/userbuffers/ub_interface.h" #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" #include "tensorrt_llm/nanobind/common/customCasters.h" @@ -24,7 +25,9 @@ namespace nb = nanobind; namespace tub = tensorrt_llm::runtime::ub; -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { void UserBufferBindings::initBindings(nb::module_& m) @@ -49,4 +52,6 @@ void UserBufferBindings::initBindings(nb::module_& m) m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager, nb::call_guard()); } -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h index 15728bf6c1..6956aac5bd 100644 --- a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h +++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h @@ -17,14 +17,20 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include + namespace nb = nanobind; -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { class UserBufferBindings { public: static void initBindings(nb::module_& m); }; -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp index 58f4bfa85c..743df47309 100644 --- a/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/userbuffers/bindings.cpp @@ -16,13 +16,16 @@ */ 
#include "bindings.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/userbuffers/ub_interface.h" #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" namespace py = pybind11; namespace tub = tensorrt_llm::runtime::ub; -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { void UserBufferBindings::initBindings(pybind11::module_& m) @@ -47,4 +50,6 @@ void UserBufferBindings::initBindings(pybind11::module_& m) m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager, py::call_guard()); } -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/pybind/userbuffers/bindings.h b/cpp/tensorrt_llm/pybind/userbuffers/bindings.h index 3a8fba2cc6..1895dc7543 100644 --- a/cpp/tensorrt_llm/pybind/userbuffers/bindings.h +++ b/cpp/tensorrt_llm/pybind/userbuffers/bindings.h @@ -17,14 +17,19 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/pybind/common/customCasters.h" #include -namespace tensorrt_llm::kernels::userbuffers +TRTLLM_NAMESPACE_BEGIN + +namespace kernels::userbuffers { class UserBufferBindings { public: static void initBindings(pybind11::module_& m); }; -} // namespace tensorrt_llm::kernels::userbuffers +} // namespace kernels::userbuffers + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp index 916062d3cd..3fcb38822a 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp @@ -127,7 +127,7 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step, auto batchSlotsRange = BufferRange(*dInput.batchSlots); for (auto batchSlot : batchSlotsRange) { - TensorPtr finishedStepsSlice = ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1); + ::TensorPtr finishedStepsSlice = 
ITensor::slice(decoderState.getFinishReasons(), batchSlot, 1); bufferManager.setZero(*finishedStepsSlice); } } diff --git a/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp b/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp index b94674f1ca..940d59258c 100644 --- a/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp +++ b/cpp/tensorrt_llm/thop/IndexerKCacheScatterOp.cpp @@ -23,6 +23,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -93,6 +95,8 @@ void indexer_k_cache_scatter_op(th::Tensor const& k_fp8_bytes, th::Tensor const& } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -102,5 +106,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("indexer_k_cache_scatter_op", &torch_ext::indexer_k_cache_scatter_op); + m.impl("indexer_k_cache_scatter_op", &tensorrt_llm::torch_ext::indexer_k_cache_scatter_op); } diff --git a/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp b/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp index 8a5003238c..d5a1917fbd 100644 --- a/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp +++ b/cpp/tensorrt_llm/thop/IndexerTopKOp.cpp @@ -31,6 +31,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -118,8 +120,11 @@ void indexer_topk_prefill(th::Tensor const& logits, th::Tensor const& row_starts indices.data_ptr(), num_rows, num_columns, static_cast(logits_stride_0), static_cast(logits_stride_1), static_cast(index_topk), stream); } + } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -129,7 +134,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("indexer_topk_decode", &torch_ext::indexer_topk_decode); + m.impl("indexer_topk_decode", &tensorrt_llm::torch_ext::indexer_topk_decode); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -141,5 +146,5 @@ 
TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("indexer_topk_prefill", &torch_ext::indexer_topk_prefill); + m.impl("indexer_topk_prefill", &tensorrt_llm::torch_ext::indexer_topk_prefill); } diff --git a/cpp/tensorrt_llm/thop/allgatherOp.cpp b/cpp/tensorrt_llm/thop/allgatherOp.cpp index 0ce8d99e58..0d92aa9669 100644 --- a/cpp/tensorrt_llm/thop/allgatherOp.cpp +++ b/cpp/tensorrt_llm/thop/allgatherOp.cpp @@ -35,6 +35,8 @@ using tensorrt_llm::pg_utils::PgHelper; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { #if ENABLE_MULTI_DEVICE @@ -286,6 +288,8 @@ std::vector allgather_list_pg(torch::TensorList input_list, torch } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("allgather(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); @@ -300,8 +304,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("allgather", &torch_ext::allgather); - m.impl("allgather_pg", &torch_ext::allgather_pg); - m.impl("allgather_list", &torch_ext::allgather_list); - m.impl("allgather_list_pg", &torch_ext::allgather_list_pg); + m.impl("allgather", &tensorrt_llm::torch_ext::allgather); + m.impl("allgather_pg", &tensorrt_llm::torch_ext::allgather_pg); + m.impl("allgather_list", &tensorrt_llm::torch_ext::allgather_list); + m.impl("allgather_list_pg", &tensorrt_llm::torch_ext::allgather_list_pg); } diff --git a/cpp/tensorrt_llm/thop/allreduceOp.cpp b/cpp/tensorrt_llm/thop/allreduceOp.cpp index fbd60d1ec5..c753242518 100644 --- a/cpp/tensorrt_llm/thop/allreduceOp.cpp +++ b/cpp/tensorrt_llm/thop/allreduceOp.cpp @@ -65,6 +65,8 @@ using tensorrt_llm::pg_utils::get_world_pg; using tensorrt_llm::pg_utils::get_local_pg; using tensorrt_llm::pg_utils::PgHelper; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -1528,6 +1530,8 @@ std::vector mnnvlFusionAllReduce(torch::Tensor& input, torch::opt } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -1591,11 
+1595,11 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mnnvl_fusion_allreduce", &torch_ext::mnnvlFusionAllReduce); - m.impl("allreduce", &torch_ext::allreduce_raw); - m.impl("allreduce_pg", &torch_ext::allreduce_pg); - m.impl("moe_allreduce", &torch_ext::moe_allreduce); - m.impl("moe_finalize_allreduce", &torch_ext::moe_finalize_allreduce); + m.impl("mnnvl_fusion_allreduce", &tensorrt_llm::torch_ext::mnnvlFusionAllReduce); + m.impl("allreduce", &tensorrt_llm::torch_ext::allreduce_raw); + m.impl("allreduce_pg", &tensorrt_llm::torch_ext::allreduce_pg); + m.impl("moe_allreduce", &tensorrt_llm::torch_ext::moe_allreduce); + m.impl("moe_finalize_allreduce", &tensorrt_llm::torch_ext::moe_finalize_allreduce); } TORCH_LIBRARY_IMPL(trtllm, CPU, m) diff --git a/cpp/tensorrt_llm/thop/alltoallOp.cpp b/cpp/tensorrt_llm/thop/alltoallOp.cpp index fdc691575b..61c09466db 100644 --- a/cpp/tensorrt_llm/thop/alltoallOp.cpp +++ b/cpp/tensorrt_llm/thop/alltoallOp.cpp @@ -30,6 +30,8 @@ #include #endif // ENABLE_MULTI_DEVICE +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { #if ENABLE_MULTI_DEVICE @@ -119,6 +121,8 @@ std::vector alltoall_helix( } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("alltoall_helix(Tensor[] input_list, int[] group, int? 
num_lists) -> Tensor[]"); @@ -126,5 +130,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("alltoall_helix", &torch_ext::alltoall_helix); + m.impl("alltoall_helix", &tensorrt_llm::torch_ext::alltoall_helix); } diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp index cbb498fcf8..1fb1ce1d62 100644 --- a/cpp/tensorrt_llm/thop/attentionOp.cpp +++ b/cpp/tensorrt_llm/thop/attentionOp.cpp @@ -29,6 +29,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using tensorrt_llm::common::op::AttentionOp; @@ -964,7 +966,9 @@ bool attention_supports_nvfp4_output(int64_t const num_heads, int64_t const num_ } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.def("attention_supports_nvfp4_output", &torch_ext::attention_supports_nvfp4_output); + m.def("attention_supports_nvfp4_output", &tensorrt_llm::torch_ext::attention_supports_nvfp4_output); } diff --git a/cpp/tensorrt_llm/thop/attentionOp.h b/cpp/tensorrt_llm/thop/attentionOp.h index d15a33d528..712f7b9257 100644 --- a/cpp/tensorrt_llm/thop/attentionOp.h +++ b/cpp/tensorrt_llm/thop/attentionOp.h @@ -19,6 +19,10 @@ #include #include +#include "tensorrt_llm/common/config.h" + +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -69,3 +73,5 @@ void attention(torch::Tensor q, std::optional k, std::optional mla_bmm2_scale, std::optional quant_q_buffer); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/causalConv1dOp.cpp b/cpp/tensorrt_llm/thop/causalConv1dOp.cpp index 9201cdb7e3..0d4a13672b 100644 --- a/cpp/tensorrt_llm/thop/causalConv1dOp.cpp +++ b/cpp/tensorrt_llm/thop/causalConv1dOp.cpp @@ -24,6 +24,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -289,6 +291,8 @@ void causalConv1dUpdate(at::Tensor const& x, at::Tensor const& conv_state, at::T } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -315,6 
+319,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("causal_conv1d_fwd", &torch_ext::causalConv1dFwd); - m.impl("causal_conv1d_update", &torch_ext::causalConv1dUpdate); + m.impl("causal_conv1d_fwd", &tensorrt_llm::torch_ext::causalConv1dFwd); + m.impl("causal_conv1d_update", &tensorrt_llm::torch_ext::causalConv1dUpdate); } diff --git a/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp b/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp index 5cbd2ba0de..a3ddc746e4 100644 --- a/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp +++ b/cpp/tensorrt_llm/thop/convertSpecDecodingMaskToPackedMaskOp.cpp @@ -19,6 +19,8 @@ namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLengthsTensor, @@ -81,5 +83,8 @@ void convertSpecDecodingMaskToPackedMask(torch::Tensor specDecodingGenerationLen } // namespace torch_ext -static auto convert_spec_decoding_mask_to_packed_mask = torch::RegisterOperators( - "tensorrt_llm::convert_spec_decoding_mask_to_packed_mask", &torch_ext::convertSpecDecodingMaskToPackedMask); +TRTLLM_NAMESPACE_END + +static auto convert_spec_decoding_mask_to_packed_mask + = torch::RegisterOperators("tensorrt_llm::convert_spec_decoding_mask_to_packed_mask", + &tensorrt_llm::torch_ext::convertSpecDecodingMaskToPackedMask); diff --git a/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp index 02eae46d74..77ad23c0ab 100644 --- a/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasFp4ScaledMM.cpp @@ -27,6 +27,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -427,10 +429,12 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("CublasLtFP4GemmRunner") + m.class_("CublasLtFP4GemmRunner") .def(torch::init()) - .def("run_gemm", 
&torch_ext::CublasLtFP4GemmRunner::runGemm) - .def("get_num_heuristic_algos", &torch_ext::CublasLtFP4GemmRunner::getNumHeuristicAlgos); + .def("run_gemm", &tensorrt_llm::torch_ext::CublasLtFP4GemmRunner::runGemm) + .def("get_num_heuristic_algos", &tensorrt_llm::torch_ext::CublasLtFP4GemmRunner::getNumHeuristicAlgos); } diff --git a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp index 8baeba022b..ddf8024b91 100644 --- a/cpp/tensorrt_llm/thop/cublasScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cublasScaledMM.cpp @@ -29,6 +29,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -377,6 +379,8 @@ Tensor cublas_mm(Tensor const& mat_a, Tensor const& mat_b, std::optional #include +#include "tensorrt_llm/common/config.h" + namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { th::Tensor& cublas_mm_out( @@ -34,3 +38,5 @@ th::Tensor cublas_scaled_mm(th::Tensor const& mat_a, th::Tensor const& mat_b, th th::Tensor cublas_scaled_mm_out(th::Tensor const& mat_a, th::Tensor const& mat_b, th::Tensor const& scale_a, th::Tensor const& scale_b, std::optional const& bias, th::Tensor& out); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp b/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp index 8a8ddb32e2..bcd9d9d62e 100644 --- a/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp +++ b/cpp/tensorrt_llm/thop/cudaNvfp4MM.cpp @@ -24,6 +24,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -131,6 +133,8 @@ Tensor cuda_core_nvfp4_gemm(Tensor const& mat_a, Tensor const& mat_b, Tensor con } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -140,5 +144,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("cuda_core_nvfp4_gemm", &torch_ext::cuda_core_nvfp4_gemm); + m.impl("cuda_core_nvfp4_gemm", &tensorrt_llm::torch_ext::cuda_core_nvfp4_gemm); } diff --git a/cpp/tensorrt_llm/thop/cudaScaledMM.cpp 
b/cpp/tensorrt_llm/thop/cudaScaledMM.cpp index 60a7358f5a..db4713f60e 100644 --- a/cpp/tensorrt_llm/thop/cudaScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cudaScaledMM.cpp @@ -24,6 +24,8 @@ using torch::Tensor; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -121,6 +123,8 @@ Tensor cuda_scaled_mm(Tensor const& mat_a, Tensor const& mat_b, Tensor const& sc } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -130,5 +134,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("cuda_scaled_mm", &torch_ext::cuda_scaled_mm); + m.impl("cuda_scaled_mm", &tensorrt_llm::torch_ext::cuda_scaled_mm); } diff --git a/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp b/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp index 81fb4acf9c..e7f0164ab3 100644 --- a/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp +++ b/cpp/tensorrt_llm/thop/customMoeRoutingOp.cpp @@ -22,6 +22,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { template @@ -121,6 +123,8 @@ std::tuple default_moe_routing_op( } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -130,7 +134,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("renorm_moe_routing_op", &torch_ext::renorm_moe_routing_op); + m.impl("renorm_moe_routing_op", &tensorrt_llm::torch_ext::renorm_moe_routing_op); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -142,5 +146,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("default_moe_routing_op", &torch_ext::default_moe_routing_op); + m.impl("default_moe_routing_op", &tensorrt_llm::torch_ext::default_moe_routing_op); } diff --git a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp index 54c45031a1..770c1459f9 100644 --- a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp +++ b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp @@ 
-20,6 +20,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // Sort @@ -473,6 +475,8 @@ torch::Tensor moe_gelu(torch::Tensor const& input, torch::Tensor const& tile_idx } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -503,12 +507,12 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_topk_sort", &torch_ext::moe_topk_sort); - m.impl("moe_sort", &torch_ext::moe_sort); - m.impl("moe_permute", &torch_ext::moe_permute); - m.impl("moe_unpermute", &torch_ext::moe_unpermute); - m.impl("moe_output_memset_inplace", &torch_ext::moe_output_memset_inplace); - m.impl("moe_swiglu", &torch_ext::moe_swiglu); - m.impl("moe_swiglu_nvfp4_quantize", &torch_ext::moe_swiglu_nvfp4_quantize); - m.impl("moe_gelu", &torch_ext::moe_gelu); + m.impl("moe_topk_sort", &tensorrt_llm::torch_ext::moe_topk_sort); + m.impl("moe_sort", &tensorrt_llm::torch_ext::moe_sort); + m.impl("moe_permute", &tensorrt_llm::torch_ext::moe_permute); + m.impl("moe_unpermute", &tensorrt_llm::torch_ext::moe_unpermute); + m.impl("moe_output_memset_inplace", &tensorrt_llm::torch_ext::moe_output_memset_inplace); + m.impl("moe_swiglu", &tensorrt_llm::torch_ext::moe_swiglu); + m.impl("moe_swiglu_nvfp4_quantize", &tensorrt_llm::torch_ext::moe_swiglu_nvfp4_quantize); + m.impl("moe_gelu", &tensorrt_llm::torch_ext::moe_gelu); } diff --git a/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp b/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp index c9b05bb3d5..b314cb4d16 100644 --- a/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp +++ b/cpp/tensorrt_llm/thop/cutlassScaledMM.cpp @@ -35,6 +35,8 @@ using tensorrt_llm::kernels::internal_cutlass_kernels::CutlassLowLatencyFp8GemmR using tensorrt_llm::kernels::internal_cutlass_kernels::LowLatencyCutlassGemmConfig; using tensorrt_llm::kernels::internal_cutlass_kernels::KernelScheduleType; #endif +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -171,6 +173,8 @@ Tensor cutlass_scaled_mm(Tensor const& mat_a, Tensor 
const& mat_b, Tensor const& } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -180,5 +184,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("cutlass_scaled_mm", &torch_ext::cutlass_scaled_mm); + m.impl("cutlass_scaled_mm", &tensorrt_llm::torch_ext::cutlass_scaled_mm); } diff --git a/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp b/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp index 9d8bb5de35..c16f16a680 100644 --- a/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp +++ b/cpp/tensorrt_llm/thop/dsv3FusedAGemmOp.cpp @@ -24,6 +24,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { th::Tensor dsv3_fused_a_gemm_op(th::Tensor const& mat_a, th::Tensor const& mat_b, std::optional const& bias, @@ -85,6 +87,8 @@ th::Tensor dsv3_fused_a_gemm_op(th::Tensor const& mat_a, th::Tensor const& mat_b } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("dsv3_fused_a_gemm_op(Tensor mat_a, Tensor mat_b, Tensor? bias, ScalarType? 
out_dtype) -> (Tensor out)"); @@ -92,5 +96,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("dsv3_fused_a_gemm_op", &torch_ext::dsv3_fused_a_gemm_op); + m.impl("dsv3_fused_a_gemm_op", &tensorrt_llm::torch_ext::dsv3_fused_a_gemm_op); } diff --git a/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp b/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp index 39657c71e7..ff28f2004f 100644 --- a/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp +++ b/cpp/tensorrt_llm/thop/dsv3RopeOp.cpp @@ -38,6 +38,8 @@ namespace tk = tensorrt_llm::kernels; namespace tc = tensorrt_llm::common; namespace tr = tensorrt_llm::runtime; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -308,6 +310,8 @@ void MLARopeGeneration(torch::Tensor fused_q, // [tokens, num_heads, (nope_dim + } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -356,5 +360,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mla_rope_generation", &torch_ext::MLARopeGeneration); + m.impl("mla_rope_generation", &tensorrt_llm::torch_ext::MLARopeGeneration); } diff --git a/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp b/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp index 89ead8cade..6764cbef64 100644 --- a/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp +++ b/cpp/tensorrt_llm/thop/dsv3RouterGemmOp.cpp @@ -24,6 +24,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -106,6 +108,8 @@ th::Tensor dsv3_router_gemm_op(th::Tensor const& mat_a, th::Tensor const& mat_b, } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("dsv3_router_gemm_op(Tensor mat_a, Tensor mat_b, Tensor? bias, ScalarType? 
out_dtype) -> (Tensor out)"); @@ -113,5 +117,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("dsv3_router_gemm_op", &torch_ext::dsv3_router_gemm_op); + m.impl("dsv3_router_gemm_op", &tensorrt_llm::torch_ext::dsv3_router_gemm_op); } diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp index f9e0e76a46..8e9e817bbb 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.cpp @@ -33,6 +33,8 @@ namespace tr = tensorrt_llm::runtime; namespace tl = tensorrt_llm::layers; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -452,8 +454,10 @@ th::Tensor DynamicDecodeOp::forward( } // namespace torch_ext +TRTLLM_NAMESPACE_END + static auto trtllmGptContextDecoderTHS - = torch::jit::class_("trtllm", "DynamicDecodeOp") + = torch::jit::class_("trtllm", "DynamicDecodeOp") .def(torch::jit::init()) - .def("setup", &torch_ext::DynamicDecodeOp::setup) - .def("forward", &torch_ext::DynamicDecodeOp::forward); + .def("setup", &tensorrt_llm::torch_ext::DynamicDecodeOp::setup) + .def("forward", &tensorrt_llm::torch_ext::DynamicDecodeOp::forward); diff --git a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h index 533066cc2a..c8f4fa807d 100644 --- a/cpp/tensorrt_llm/thop/dynamicDecodeOp.h +++ b/cpp/tensorrt_llm/thop/dynamicDecodeOp.h @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/layers/dynamicDecodeLayer.h" #include "tensorrt_llm/runtime/iTensor.h" @@ -21,6 +22,8 @@ namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -158,3 +161,5 @@ private: }; } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp index f2255604e2..6d47a76021 100644 --- a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp +++ b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.cpp @@ -41,6 +41,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -271,10 +273,12 @@ int64_t finegrainedMixedDtypeGemmRunner::getNumConfigs() const } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("finegrainedMixedDtypeGemmRunner") + m.class_("finegrainedMixedDtypeGemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::finegrainedMixedDtypeGemmRunner::runGemm) - .def("get_num_configs", &torch_ext::finegrainedMixedDtypeGemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::finegrainedMixedDtypeGemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::finegrainedMixedDtypeGemmRunner::getNumConfigs); } diff --git a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h index 5bda7be3eb..e8a11d2bdc 100644 --- a/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h +++ b/cpp/tensorrt_llm/thop/finegrained_mixed_dtype_gemm_thop.h @@ -18,9 +18,12 @@ #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -44,3 +47,5 @@ private: }; } // 
namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp b/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp index 57d8f6609c..5fa8d8637e 100644 --- a/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp +++ b/cpp/tensorrt_llm/thop/fmhaPackMaskOp.cpp @@ -14,10 +14,13 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/mathUtils.h" #include "tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaPackedMask.h" #include "tensorrt_llm/thop/thUtils.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using torch::Tensor; @@ -177,12 +180,14 @@ Tensor pack_fmha_mask_by_input( } // namespace torch_ext +TRTLLM_NAMESPACE_END + //////////////////////////////////////////////////////////////////////////////////////////////////// // Utility methods. -static auto pack_fmha_mask_by_type - = torch::RegisterOperators("tensorrt_llm::pack_fmha_mask_by_type", &torch_ext::pack_fmha_mask_by_type); +static auto pack_fmha_mask_by_type = torch::RegisterOperators( + "tensorrt_llm::pack_fmha_mask_by_type", &tensorrt_llm::torch_ext::pack_fmha_mask_by_type); // Utility methods. 
-static auto pack_fmha_mask_by_input - = torch::RegisterOperators("tensorrt_llm::pack_fmha_mask_by_input", &torch_ext::pack_fmha_mask_by_input); +static auto pack_fmha_mask_by_input = torch::RegisterOperators( + "tensorrt_llm::pack_fmha_mask_by_input", &tensorrt_llm::torch_ext::pack_fmha_mask_by_input); diff --git a/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp b/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp index 01368ee384..9ecda1a884 100644 --- a/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp +++ b/cpp/tensorrt_llm/thop/fp4BatchedQuantize.cpp @@ -24,6 +24,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // self: [B, M, K], fp16/bf16/fp8_quantized @@ -99,6 +101,8 @@ std::tuple fp4_batched_quantize( } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -108,5 +112,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_batched_quantize", &torch_ext::fp4_batched_quantize); + m.impl("fp4_batched_quantize", &tensorrt_llm::torch_ext::fp4_batched_quantize); } diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp index 700c1a7d5a..81746654a4 100644 --- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp @@ -22,6 +22,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace btg = batchedGemm::trtllm::gen; @@ -576,17 +578,20 @@ torch::Tensor shuffleMatrix(torch::Tensor matrix, torch::Tensor permuteIndices) } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP4BlockScaleMoERunner") + m.class_("FP4BlockScaleMoERunner") .def(torch::init<>()) - .def("get_valid_configs", &torch_ext::FP4BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::FP4BlockScaleMoeRunner::run); - m.class_("FP8FP4BlockScaleMoERunner") + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP4BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", 
&tensorrt_llm::torch_ext::FP4BlockScaleMoeRunner::run); + m.class_("FP8FP4BlockScaleMoERunner") .def(torch::init()) - .def("get_valid_configs", &torch_ext::FP8FP4BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::FP8FP4BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP8FP4BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::FP8FP4BlockScaleMoeRunner::run); } // Accepts both CPU and CUDA tensors -static auto shuffle_matrix = torch::RegisterOperators("trtllm::shuffle_matrix", &torch_ext::shuffleMatrix); +static auto shuffle_matrix + = torch::RegisterOperators("trtllm::shuffle_matrix", &tensorrt_llm::torch_ext::shuffleMatrix); diff --git a/cpp/tensorrt_llm/thop/fp4Gemm.cpp b/cpp/tensorrt_llm/thop/fp4Gemm.cpp index 2fa818bdee..9c33436dc0 100644 --- a/cpp/tensorrt_llm/thop/fp4Gemm.cpp +++ b/cpp/tensorrt_llm/thop/fp4Gemm.cpp @@ -47,6 +47,8 @@ using tensorrt_llm::kernels::internal_cutlass_kernels::CutlassFp4GemmRunner; using tensorrt_llm::kernels::internal_cutlass_kernels::CutlassFp4GemmRunnerInterface; #endif +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -310,12 +312,14 @@ private: }; } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP4GemmRunner") + m.class_("FP4GemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::FP4GemmRunner::runGemm) - .def("get_num_configs", &torch_ext::FP4GemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::FP4GemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::FP4GemmRunner::getNumConfigs); m.def( "fp4_bmm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale, Tensor globalScale, int fp4GemmType, " @@ -327,6 +331,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_bmm", &torch_ext::fp4_bmm); - m.impl("fp4_gemm", &torch_ext::fp4_bmm); + m.impl("fp4_bmm", &tensorrt_llm::torch_ext::fp4_bmm); + m.impl("fp4_gemm", 
&tensorrt_llm::torch_ext::fp4_bmm); } diff --git a/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp index 6b923336d1..1c9ac017fb 100644 --- a/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp4GemmTrtllmGen.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -126,6 +128,8 @@ at::Tensor fp4_gemm_trtllmgen(at::Tensor const& mat1, at::Tensor const& mat2, at } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -136,5 +140,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_gemm_trtllmgen", &torch_ext::fp4_gemm_trtllmgen); + m.impl("fp4_gemm_trtllmgen", &tensorrt_llm::torch_ext::fp4_gemm_trtllmgen); } diff --git a/cpp/tensorrt_llm/thop/fp4Op.cpp b/cpp/tensorrt_llm/thop/fp4Op.cpp index 54746be1c7..abaf242858 100644 --- a/cpp/tensorrt_llm/thop/fp4Op.cpp +++ b/cpp/tensorrt_llm/thop/fp4Op.cpp @@ -27,6 +27,8 @@ namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -476,17 +478,19 @@ th::Tensor E2M1AndUFP8SFScaleToFloatV2(th::Tensor valueE2M1, th::Tensor scaleFP8 } // namespace torch_ext -static auto float_to_e2m1_and_ufp8sf_scale - = torch::RegisterOperators("tensorrt_llm::float_to_e2m1_and_ufp8sf_scale", &torch_ext::FloatToE2M1AndUFP8SFScale); +TRTLLM_NAMESPACE_END -static auto half_to_e2m1_and_ufp8sf_scale - = torch::RegisterOperators("tensorrt_llm::half_to_e2m1_and_ufp8sf_scale", &torch_ext::HalfToE2M1AndUFP8SFScale); +static auto float_to_e2m1_and_ufp8sf_scale = torch::RegisterOperators( + "tensorrt_llm::float_to_e2m1_and_ufp8sf_scale", &tensorrt_llm::torch_ext::FloatToE2M1AndUFP8SFScale); -static auto e2m1_and_ufp8sf_scale_to_float - = torch::RegisterOperators("tensorrt_llm::e2m1_and_ufp8sf_scale_to_float", &torch_ext::E2M1AndUFP8SFScaleToFloat); +static auto half_to_e2m1_and_ufp8sf_scale = torch::RegisterOperators( + "tensorrt_llm::half_to_e2m1_and_ufp8sf_scale", 
&tensorrt_llm::torch_ext::HalfToE2M1AndUFP8SFScale); + +static auto e2m1_and_ufp8sf_scale_to_float = torch::RegisterOperators( + "tensorrt_llm::e2m1_and_ufp8sf_scale_to_float", &tensorrt_llm::torch_ext::E2M1AndUFP8SFScaleToFloat); static auto e2m1_and_ufp8sf_scale_to_float_v2 = torch::RegisterOperators( - "tensorrt_llm::e2m1_and_ufp8sf_scale_to_float_v2", &torch_ext::E2M1AndUFP8SFScaleToFloatV2); + "tensorrt_llm::e2m1_and_ufp8sf_scale_to_float_v2", &tensorrt_llm::torch_ext::E2M1AndUFP8SFScaleToFloatV2); TORCH_LIBRARY_FRAGMENT(trtllm, m) { @@ -496,12 +500,12 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("block_scale_interleave", &torch_ext::BlockScaleInterleave); - m.impl("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse); + m.impl("block_scale_interleave", &tensorrt_llm::torch_ext::BlockScaleInterleave); + m.impl("block_scale_interleave_reverse", &tensorrt_llm::torch_ext::BlockScaleInterleaveReverse); } TORCH_LIBRARY_IMPL(trtllm, CPU, m) { - m.impl("block_scale_interleave", &torch_ext::BlockScaleInterleave); - m.impl("block_scale_interleave_reverse", &torch_ext::BlockScaleInterleaveReverse); + m.impl("block_scale_interleave", &tensorrt_llm::torch_ext::BlockScaleInterleave); + m.impl("block_scale_interleave_reverse", &tensorrt_llm::torch_ext::BlockScaleInterleaveReverse); } diff --git a/cpp/tensorrt_llm/thop/fp4Quantize.cpp b/cpp/tensorrt_llm/thop/fp4Quantize.cpp index a4d9b038bf..61745850c8 100644 --- a/cpp/tensorrt_llm/thop/fp4Quantize.cpp +++ b/cpp/tensorrt_llm/thop/fp4Quantize.cpp @@ -26,6 +26,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // self: [M, K], fp16/bf16/fp8_quantized @@ -232,6 +234,8 @@ at::Tensor calculate_nvfp4_global_scale(at::Tensor const& input, std::optional #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { std::tuple fp4_quantize(at::Tensor const& self, std::optional const& globalScale, @@ -29,3 +33,5 @@ std::tuple fp4_quantize(at::Tensor const& self, 
std::opt at::Tensor calculate_nvfp4_global_scale(at::Tensor const& input, std::optional const& tokensPerBatch); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp index 8ed81c4aa9..b657b92eb3 100644 --- a/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp4xFp8GemmTrtllmGen.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -113,6 +115,8 @@ at::Tensor fp4_fp8_gemm_trtllmgen(at::Tensor const& mat1, at::Tensor const& mat2 } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -122,5 +126,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp4_fp8_gemm_trtllmgen", &torch_ext::fp4_fp8_gemm_trtllmgen); + m.impl("fp4_fp8_gemm_trtllmgen", &tensorrt_llm::torch_ext::fp4_fp8_gemm_trtllmgen); } diff --git a/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp b/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp index be1970e480..f3da650a94 100644 --- a/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp +++ b/cpp/tensorrt_llm/thop/fp8BatchedGemmTrtllmGen.cpp @@ -173,6 +173,8 @@ std::tuple fp8_batched_gemm_sm100(at::Tensor const& mat1 } } // namespace +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -268,10 +270,12 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP8BatchedGemmRunner") + m.class_("FP8BatchedGemmRunner") .def(torch::init()) - .def("get_valid_configs", &torch_ext::FP8BatchedGemmRunner::getValidConfigs) - .def("run_batched_gemm", &torch_ext::FP8BatchedGemmRunner::runBatchedGemm); + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP8BatchedGemmRunner::getValidConfigs) + .def("run_batched_gemm", &tensorrt_llm::torch_ext::FP8BatchedGemmRunner::runBatchedGemm); } diff --git a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp index 
42e55dc00c..b8e688d1d3 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp @@ -26,6 +26,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -395,10 +397,12 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP8BlockScaleMoERunner") + m.class_("FP8BlockScaleMoERunner") .def(torch::init<>()) - .def("get_valid_configs", &torch_ext::FP8BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::FP8BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::FP8BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::FP8BlockScaleMoeRunner::run); } diff --git a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp index cdea9d03fa..d6e65a2941 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" #include "tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.h" @@ -26,6 +27,8 @@ using namespace tensorrt_llm::kernels::fp8_blockscale_gemm; using namespace tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -382,6 +385,8 @@ torch::Tensor fp8_block_scaling_bmm(torch::Tensor const& mat1, torch::Tensor con } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("fp8_block_scaling_gemm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale) -> Tensor"); @@ -398,8 +403,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_block_scaling_gemm", &torch_ext::fp8_block_scaling_gemm); - m.impl("fp8_block_scaling_bmm", &torch_ext::fp8_block_scaling_bmm); - m.impl("fp8_block_scaling_bmm_out", &torch_ext::fp8_block_scaling_bmm_out); - m.impl("fp8_block_scaling_moe_gemm", &torch_ext::fp8_block_scaling_moe_gemm); + m.impl("fp8_block_scaling_gemm", &tensorrt_llm::torch_ext::fp8_block_scaling_gemm); + m.impl("fp8_block_scaling_bmm", &tensorrt_llm::torch_ext::fp8_block_scaling_bmm); + m.impl("fp8_block_scaling_bmm_out", &tensorrt_llm::torch_ext::fp8_block_scaling_bmm_out); + m.impl("fp8_block_scaling_moe_gemm", &tensorrt_llm::torch_ext::fp8_block_scaling_moe_gemm); } diff --git a/cpp/tensorrt_llm/thop/fp8Op.cpp b/cpp/tensorrt_llm/thop/fp8Op.cpp index 21f56757c6..867fd3de0c 100644 --- a/cpp/tensorrt_llm/thop/fp8Op.cpp +++ b/cpp/tensorrt_llm/thop/fp8Op.cpp @@ -16,6 +16,7 @@ #include "tensorrt_llm/thop/fp8Op.h" #include "cutlass/numeric_types.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/common/cudaFp8Utils.h" #include "tensorrt_llm/thop/thUtils.h" @@ -26,6 +27,8 @@ #define TORCH_IS_AT_LEAST_v190 #endif +TRTLLM_NAMESPACE_BEGIN + namespace 
torch_ext { using torch::Tensor; @@ -370,6 +373,8 @@ Tensor symmetric_dequantize_per_tensor(Tensor input, Tensor scales) } // namespace torch_ext +TRTLLM_NAMESPACE_END + // Utility methods that may be useful for preprocessing weights in torch. TORCH_LIBRARY_FRAGMENT(tensorrt_llm, m) { @@ -386,19 +391,19 @@ TORCH_LIBRARY_FRAGMENT(tensorrt_llm, m) TORCH_LIBRARY_IMPL(tensorrt_llm, CUDA, m) { - m.impl("quantize_e4m3_weight", &torch_ext::symmetric_quantize_weight); - m.impl("quantize_e4m3_activation", &torch_ext::symmetric_quantize_activation); - m.impl("quantize_e4m3_per_tensor", &torch_ext::symmetric_quantize_per_tensor); - m.impl("static_quantize_e4m3_weight", &torch_ext::symmetric_static_quantize_weight); - m.impl("static_quantize_e4m3_activation", &torch_ext::symmetric_static_quantize_activation); - m.impl("static_quantize_e4m3_per_tensor", &torch_ext::symmetric_static_quantize_per_tensor); - m.impl("dequantize_e4m3_weight", &torch_ext::symmetric_dequantize_weight); - m.impl("dequantize_e4m3_activation", &torch_ext::symmetric_dequantize_activation); - m.impl("dequantize_e4m3_per_tensor", &torch_ext::symmetric_dequantize_per_tensor); + m.impl("quantize_e4m3_weight", &tensorrt_llm::torch_ext::symmetric_quantize_weight); + m.impl("quantize_e4m3_activation", &tensorrt_llm::torch_ext::symmetric_quantize_activation); + m.impl("quantize_e4m3_per_tensor", &tensorrt_llm::torch_ext::symmetric_quantize_per_tensor); + m.impl("static_quantize_e4m3_weight", &tensorrt_llm::torch_ext::symmetric_static_quantize_weight); + m.impl("static_quantize_e4m3_activation", &tensorrt_llm::torch_ext::symmetric_static_quantize_activation); + m.impl("static_quantize_e4m3_per_tensor", &tensorrt_llm::torch_ext::symmetric_static_quantize_per_tensor); + m.impl("dequantize_e4m3_weight", &tensorrt_llm::torch_ext::symmetric_dequantize_weight); + m.impl("dequantize_e4m3_activation", &tensorrt_llm::torch_ext::symmetric_dequantize_activation); + m.impl("dequantize_e4m3_per_tensor", 
&tensorrt_llm::torch_ext::symmetric_dequantize_per_tensor); } -static auto dequantize_mxe4m3_host - = torch::RegisterOperators("tensorrt_llm::dequantize_mxe4m3_host", &torch_ext::dequantize_mxe4m3_host); +static auto dequantize_mxe4m3_host = torch::RegisterOperators( + "tensorrt_llm::dequantize_mxe4m3_host", &tensorrt_llm::torch_ext::dequantize_mxe4m3_host); static auto quantize_mxe4m3_host - = torch::RegisterOperators("tensorrt_llm::quantize_mxe4m3_host", &torch_ext::quantize_mxe4m3_host); + = torch::RegisterOperators("tensorrt_llm::quantize_mxe4m3_host", &tensorrt_llm::torch_ext::quantize_mxe4m3_host); diff --git a/cpp/tensorrt_llm/thop/fp8Op.h b/cpp/tensorrt_llm/thop/fp8Op.h index 1b08935d1d..1a9955c4d5 100644 --- a/cpp/tensorrt_llm/thop/fp8Op.h +++ b/cpp/tensorrt_llm/thop/fp8Op.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/quantization.h" #include "tensorrt_llm/thop/thUtils.h" @@ -26,6 +27,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // Given the rowIdx and colIdx in the unswizzled SFMatrix, compute the 1D offset in the swizzled SFMatrix. 
@@ -83,3 +86,5 @@ torch::Tensor symmetric_dequantize_activation(torch::Tensor activation, torch::T torch::Tensor symmetric_dequantize_per_tensor(torch::Tensor input, torch::Tensor scales); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp index a1794d6c2f..9681be6e7a 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScaleMoe.cpp @@ -19,6 +19,8 @@ #include "tensorrt_llm/thop/thUtils.h" #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -310,6 +312,8 @@ torch::Tensor fp8_per_tensor_scale_moe_runner(torch::optional con } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -339,5 +343,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_per_tensor_scale_moe_runner", &torch_ext::fp8_per_tensor_scale_moe_runner); + m.impl("fp8_per_tensor_scale_moe_runner", &tensorrt_llm::torch_ext::fp8_per_tensor_scale_moe_runner); } diff --git a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp index 5c66eaf4f6..7f044a198e 100644 --- a/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8PerTensorScalingTrtllmGenGemm.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -161,6 +163,8 @@ torch::Tensor fp8_per_tensor_scaling_tllmg_gemm(torch::Tensor const& mat1, torch } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -170,5 +174,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_per_tensor_scaling_tllmg_gemm", &torch_ext::fp8_per_tensor_scaling_tllmg_gemm); + m.impl("fp8_per_tensor_scaling_tllmg_gemm", &tensorrt_llm::torch_ext::fp8_per_tensor_scaling_tllmg_gemm); } diff --git a/cpp/tensorrt_llm/thop/fp8Quantize.cpp 
b/cpp/tensorrt_llm/thop/fp8Quantize.cpp index 7b0f86c47b..91746a321b 100644 --- a/cpp/tensorrt_llm/thop/fp8Quantize.cpp +++ b/cpp/tensorrt_llm/thop/fp8Quantize.cpp @@ -20,6 +20,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -135,6 +137,8 @@ std::tuple fp8_batched_quantize_1x128_permute102(at::Ten } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("fp8_quantize_1x128(Tensor input, bool use_ue8m0=False) -> (Tensor, Tensor)"); @@ -143,6 +147,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_quantize_1x128", &torch_ext::fp8_quantize_1x128); - m.impl("fp8_batched_quantize_1x128_permute102", &torch_ext::fp8_batched_quantize_1x128_permute102); + m.impl("fp8_quantize_1x128", &tensorrt_llm::torch_ext::fp8_quantize_1x128); + m.impl("fp8_batched_quantize_1x128_permute102", &tensorrt_llm::torch_ext::fp8_batched_quantize_1x128_permute102); } diff --git a/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp b/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp index 97a05a568c..a90795badf 100644 --- a/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8RowwiseGemm.cpp @@ -34,6 +34,8 @@ using tensorrt_llm::kernels::cutlass_kernels::CutlassFp8RowwiseGemmRunner; using tensorrt_llm::kernels::cutlass_kernels::CutlassFp8RowwiseGemmRunnerInterface; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -183,10 +185,12 @@ private: }; } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("FP8RowwiseGemmRunner") + m.class_("FP8RowwiseGemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::FP8RowwiseGemmRunner::runGemm) - .def("get_num_configs", &torch_ext::FP8RowwiseGemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::FP8RowwiseGemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::FP8RowwiseGemmRunner::getNumConfigs); } diff --git a/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp 
b/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp index 20225ab71c..14bf8578dc 100644 --- a/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp +++ b/cpp/tensorrt_llm/thop/fusedQKNormRopeOp.cpp @@ -20,6 +20,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -94,3 +96,5 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m) } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp b/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp index 6b6e0edc7c..0974b30f43 100644 --- a/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp +++ b/cpp/tensorrt_llm/thop/fusedTopkSoftmax.cpp @@ -25,6 +25,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -56,6 +58,8 @@ std::tuple fused_topk_softmax(torch::Tensor const& } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -66,5 +70,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fused_topk_softmax", &torch_ext::fused_topk_softmax); + m.impl("fused_topk_softmax", &tensorrt_llm::torch_ext::fused_topk_softmax); } diff --git a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp index e951830768..45f2649a6a 100644 --- a/cpp/tensorrt_llm/thop/gatherTreeOp.cpp +++ b/cpp/tensorrt_llm/thop/gatherTreeOp.cpp @@ -24,6 +24,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -135,4 +137,6 @@ th::Tensor gatherTree( // BS: batch_size, BM: } // namespace torch_ext -static auto gather_tree = torch::RegisterOperators("tensorrt_llm::gather_tree", &torch_ext::gatherTree); +TRTLLM_NAMESPACE_END + +static auto gather_tree = torch::RegisterOperators("tensorrt_llm::gather_tree", &tensorrt_llm::torch_ext::gatherTree); diff --git a/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp b/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp index 4cdffe6363..c408a8c286 100644 --- a/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp +++ 
b/cpp/tensorrt_llm/thop/groupRmsNormOp.cpp @@ -28,6 +28,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -280,10 +282,12 @@ void groupRMSNormHeuristic(torch::TensorList const& inputs, torch::TensorList co } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("group_rms_norm_base", &torch_ext::groupRMSNormBase); - m.impl("group_rms_norm_large_batch", &torch_ext::groupRMSNormLargeBatch); + m.impl("group_rms_norm_base", &tensorrt_llm::torch_ext::groupRMSNormBase); + m.impl("group_rms_norm_large_batch", &tensorrt_llm::torch_ext::groupRMSNormLargeBatch); // Use groupRMSNormHeuristic which automatically selects between regular and large batch kernels - m.impl("group_rms_norm_heuristic", &torch_ext::groupRMSNormHeuristic); + m.impl("group_rms_norm_heuristic", &tensorrt_llm::torch_ext::groupRMSNormHeuristic); } diff --git a/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp b/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp index 90a70c5edf..f8425cbade 100644 --- a/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp +++ b/cpp/tensorrt_llm/thop/helixPostProcessOp.cpp @@ -21,6 +21,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -108,3 +110,5 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m) } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/llama4MinLatency.cpp b/cpp/tensorrt_llm/thop/llama4MinLatency.cpp index 53873e3d27..6737ca0dfd 100644 --- a/cpp/tensorrt_llm/thop/llama4MinLatency.cpp +++ b/cpp/tensorrt_llm/thop/llama4MinLatency.cpp @@ -33,6 +33,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -210,10 +212,12 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("llama4_bf16_bf16_gemm", &torch_ext::llama4_bf16_bf16_gemm); - m.impl("llama4_fp8_bf16_gemm", &torch_ext::llama4_fp8_bf16_gemm); - m.impl("llama4_fp8_fp8_gemm_swiglu", &torch_ext::llama4_fp8_fp8_gemm_swiglu); - m.impl("llama4_moe_tp8ep1_min_latency", 
&torch_ext::llama4_moe_tp8ep1_min_latency); + m.impl("llama4_bf16_bf16_gemm", &tensorrt_llm::torch_ext::llama4_bf16_bf16_gemm); + m.impl("llama4_fp8_bf16_gemm", &tensorrt_llm::torch_ext::llama4_fp8_bf16_gemm); + m.impl("llama4_fp8_fp8_gemm_swiglu", &tensorrt_llm::torch_ext::llama4_fp8_fp8_gemm_swiglu); + m.impl("llama4_moe_tp8ep1_min_latency", &tensorrt_llm::torch_ext::llama4_moe_tp8ep1_min_latency); } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp b/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp index 0a3fa76ff6..2f6eddd5ca 100644 --- a/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp +++ b/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp @@ -18,6 +18,8 @@ #include "tensorrt_llm/kernels/logitsBitmask.h" #include "tensorrt_llm/thop/thUtils.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -95,6 +97,8 @@ void logitsBitmask(torch::Tensor const& logits, torch::Tensor const& bitmask, } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("logits_bitmask(Tensor(a!) logits, Tensor bitmask, Tensor? token_mask=None, Tensor? 
d2t=None) -> ()"); @@ -102,5 +106,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("logits_bitmask", &torch_ext::logitsBitmask); + m.impl("logits_bitmask", &tensorrt_llm::torch_ext::logitsBitmask); } diff --git a/cpp/tensorrt_llm/thop/loraOp.cpp b/cpp/tensorrt_llm/thop/loraOp.cpp index 379e7cf43c..08cf10decf 100644 --- a/cpp/tensorrt_llm/thop/loraOp.cpp +++ b/cpp/tensorrt_llm/thop/loraOp.cpp @@ -26,6 +26,8 @@ namespace th = torch; namespace tk = tensorrt_llm::kernels; using tensorrt_llm::common::fmtstr; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -174,6 +176,8 @@ std::vector lora_grouped_gemm(th::Tensor const& input, th::Tensor co } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -192,5 +196,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("lora_grouped_gemm", &torch_ext::lora_grouped_gemm); + m.impl("lora_grouped_gemm", &tensorrt_llm::torch_ext::lora_grouped_gemm); } diff --git a/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp b/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp index f1933ae3cd..81f5a9ac8b 100644 --- a/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp +++ b/cpp/tensorrt_llm/thop/mambaConv1dOp.cpp @@ -21,6 +21,8 @@ namespace th = torch; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -173,6 +175,8 @@ std::tuple mamba_conv1d(th::Tensor const& input, th::Ten } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -187,5 +191,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mamba_conv1d", &torch_ext::mamba_conv1d); + m.impl("mamba_conv1d", &tensorrt_llm::torch_ext::mamba_conv1d); } diff --git a/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp b/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp index 6dfffec54d..171f0d1522 100644 --- a/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp +++ b/cpp/tensorrt_llm/thop/mlaPreprocessOp.cpp @@ -28,6 +28,8 @@ namespace tk 
= tensorrt_llm::kernels; namespace tc = tensorrt_llm::common; using tk::KVBlockArray; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -468,6 +470,8 @@ void mergeChunkedAttentionForMLA(torch::Tensor& merged_attn, torch::Tensor const } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -496,7 +500,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("load_paged_kv_cache_for_mla", &torch_ext::loadPagedKVCacheForMLA); + m.impl("load_paged_kv_cache_for_mla", &tensorrt_llm::torch_ext::loadPagedKVCacheForMLA); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -527,7 +531,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("load_chunked_kv_cache_for_mla", &torch_ext::loadChunkedKVCacheForMLA); + m.impl("load_chunked_kv_cache_for_mla", &tensorrt_llm::torch_ext::loadChunkedKVCacheForMLA); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -562,7 +566,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mla_rope_append_paged_kv_assign_q", &torch_ext::MLARopeAppendPagedKVAssignQ); + m.impl("mla_rope_append_paged_kv_assign_q", &tensorrt_llm::torch_ext::MLARopeAppendPagedKVAssignQ); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -584,5 +588,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("merge_chunked_attention_for_mla", &torch_ext::mergeChunkedAttentionForMLA); + m.impl("merge_chunked_attention_for_mla", &tensorrt_llm::torch_ext::mergeChunkedAttentionForMLA); } diff --git a/cpp/tensorrt_llm/thop/moeAlignOp.cpp b/cpp/tensorrt_llm/thop/moeAlignOp.cpp index b12b7fc401..d28b9261af 100644 --- a/cpp/tensorrt_llm/thop/moeAlignOp.cpp +++ b/cpp/tensorrt_llm/thop/moeAlignOp.cpp @@ -14,12 +14,15 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/moeAlignKernels.h" #include "thUtils.h" #include namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -46,6 +49,8 @@ void moeAlignBlockSizeOp(torch::Tensor topk_ids, int64_t num_experts, int64_t bl } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -55,5 +60,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_align_block_size", &torch_ext::moeAlignBlockSizeOp); + m.impl("moe_align_block_size", &tensorrt_llm::torch_ext::moeAlignBlockSizeOp); } diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h b/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h index ef37af4bc1..d8634e6a4f 100644 --- a/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h +++ b/cpp/tensorrt_llm/thop/moeAlltoAllMeta.h @@ -16,11 +16,15 @@ #pragma once +#include "tensorrt_llm/common/config.h" + #include #include #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace moe_comm @@ -63,3 +67,5 @@ inline std::vector> getMoeA2AMetaInfoIndexPairs( } // namespace moe_comm } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp index 2a74f36457..e11135ddfb 100644 --- a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp +++ b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp @@ -25,6 +25,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -521,6 +523,8 @@ int64_t moeA2AGetAuxDataSizeOp(int64_t epSize, int64_t maxNumTokens) } // namespace torch_ext +TRTLLM_NAMESPACE_END + // PyTorch bindings TORCH_LIBRARY_FRAGMENT(trtllm, module) { @@ -546,14 +550,15 @@ TORCH_LIBRARY_FRAGMENT(trtllm, module) "runtime_max_tokens_per_rank, " "int combine_payload_offset, ScalarType out_dtype, int hidden_size) -> Tensor(a)"); module.def("moe_a2a_get_aux_data_size(int ep_size, int max_num_tokens) -> int", - &torch_ext::moe_comm::moeA2AGetAuxDataSizeOp); + 
&tensorrt_llm::torch_ext::moe_comm::moeA2AGetAuxDataSizeOp); } TORCH_LIBRARY_IMPL(trtllm, CUDA, module) { - module.impl("moe_a2a_dispatch", &torch_ext::moe_comm::moeA2ADispatchOp); - module.impl("moe_a2a_combine", &torch_ext::moe_comm::moeA2ACombineOp); - module.impl("moe_a2a_initialize", &torch_ext::moe_comm::moeA2AInitializeOp); - module.impl("moe_a2a_sanitize_expert_ids", &torch_ext::moe_comm::moeA2ASanitizeExpertIdsOp); - module.impl("moe_a2a_get_combine_payload_tensor", &torch_ext::moe_comm::moeA2AGetCombinePayloadTensorOp); + module.impl("moe_a2a_dispatch", &tensorrt_llm::torch_ext::moe_comm::moeA2ADispatchOp); + module.impl("moe_a2a_combine", &tensorrt_llm::torch_ext::moe_comm::moeA2ACombineOp); + module.impl("moe_a2a_initialize", &tensorrt_llm::torch_ext::moe_comm::moeA2AInitializeOp); + module.impl("moe_a2a_sanitize_expert_ids", &tensorrt_llm::torch_ext::moe_comm::moeA2ASanitizeExpertIdsOp); + module.impl( + "moe_a2a_get_combine_payload_tensor", &tensorrt_llm::torch_ext::moe_comm::moeA2AGetCombinePayloadTensorOp); } diff --git a/cpp/tensorrt_llm/thop/moeCommOp.cpp b/cpp/tensorrt_llm/thop/moeCommOp.cpp index af8ed85b5b..aaf5255b39 100644 --- a/cpp/tensorrt_llm/thop/moeCommOp.cpp +++ b/cpp/tensorrt_llm/thop/moeCommOp.cpp @@ -25,6 +25,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -249,6 +251,8 @@ void memsetExpertIds(torch::Tensor expertsIds, torch::Tensor recvRankCountCumSum } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -259,7 +263,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_comm", &torch_ext::moeCommOp); + m.impl("moe_comm", &tensorrt_llm::torch_ext::moeCommOp); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -269,7 +273,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_initialize_workspace", &torch_ext::initializeMoeWorkspace); + m.impl("moe_initialize_workspace", 
&tensorrt_llm::torch_ext::initializeMoeWorkspace); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -279,7 +283,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("get_moe_commworkspace_size_per_rank", &torch_ext::getWorkspaceSizePerRank); + m.impl("get_moe_commworkspace_size_per_rank", &tensorrt_llm::torch_ext::getWorkspaceSizePerRank); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -289,7 +293,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("set_moe_max_usable_sm_count", &torch_ext::setMaxUsableSmCount); + m.impl("set_moe_max_usable_sm_count", &tensorrt_llm::torch_ext::setMaxUsableSmCount); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -302,7 +306,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mnnvl_moe_alltoallv_prepare_without_allgather", &torch_ext::moePrepareOp); + m.impl("mnnvl_moe_alltoallv_prepare_without_allgather", &tensorrt_llm::torch_ext::moePrepareOp); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -315,7 +319,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("memset_expert_ids", &torch_ext::memsetExpertIds); + m.impl("memset_expert_ids", &tensorrt_llm::torch_ext::memsetExpertIds); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -325,5 +329,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("get_moe_prepare_workspace_size_per_rank", &torch_ext::getPrepareWorkspaceSizePerRank); + m.impl("get_moe_prepare_workspace_size_per_rank", &tensorrt_llm::torch_ext::getPrepareWorkspaceSizePerRank); } diff --git a/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp b/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp index 4cc7bbd4b3..aacf3a62e9 100644 --- a/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp +++ b/cpp/tensorrt_llm/thop/moeLoadBalanceOp.cpp @@ -29,6 +29,8 @@ #include "tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.h" #include 
"tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -200,6 +202,8 @@ void migrateToHostAccessible(at::Tensor& tensor) } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("moe_load_balance_wait_gpu_stage(int single_layer_load_balancer_ptr) -> Tensor"); @@ -207,7 +211,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("moe_load_balance_wait_gpu_stage", &torch_ext::moeLoadBalanceWaitGpuStage); + m.impl("moe_load_balance_wait_gpu_stage", &tensorrt_llm::torch_ext::moeLoadBalanceWaitGpuStage); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -217,7 +221,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CompositeExplicitAutograd, m) { - m.impl("moe_load_balance_set_cpu_stage", &torch_ext::moeLoadBalanceSetCpuStage); + m.impl("moe_load_balance_set_cpu_stage", &tensorrt_llm::torch_ext::moeLoadBalanceSetCpuStage); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -229,7 +233,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_load_balance_statistic", &torch_ext::moeLoadBalanceStatistic); + m.impl("moe_load_balance_statistic", &tensorrt_llm::torch_ext::moeLoadBalanceStatistic); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -242,7 +246,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_hierarchical_statistic_local_device", &torch_ext::moeHierarchicalStatisticLocalDevice); + m.impl("moe_hierarchical_statistic_local_device", &tensorrt_llm::torch_ext::moeHierarchicalStatisticLocalDevice); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -254,7 +258,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_hierarchical_statistic_update", &torch_ext::moeHierarchicalStatisticUpdate); + m.impl("moe_hierarchical_statistic_update", &tensorrt_llm::torch_ext::moeHierarchicalStatisticUpdate); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -266,7 +270,7 @@ 
TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_load_balance_routing", &torch_ext::moeLoadBalanceRouting); + m.impl("moe_load_balance_routing", &tensorrt_llm::torch_ext::moeLoadBalanceRouting); } TORCH_LIBRARY_FRAGMENT(trtllm, m) @@ -276,5 +280,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("migrate_to_host_accessible", &torch_ext::migrateToHostAccessible); + m.impl("migrate_to_host_accessible", &tensorrt_llm::torch_ext::migrateToHostAccessible); } diff --git a/cpp/tensorrt_llm/thop/moeOp.cpp b/cpp/tensorrt_llm/thop/moeOp.cpp index 953de1c58f..ae62b0a32e 100644 --- a/cpp/tensorrt_llm/thop/moeOp.cpp +++ b/cpp/tensorrt_llm/thop/moeOp.cpp @@ -23,6 +23,7 @@ // Always include the public header for moe_gemm_kernels.h #include "tensorrt_llm/kernels/cutlass_kernels/include/moe_gemm_kernels.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/workspace.h" #include "tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm.h" #include "tensorrt_llm/kernels/cutlass_kernels/include/cutlass_kernel_selector.h" @@ -42,6 +43,8 @@ C10_THROW_ERROR(ErrorType, oss.str()); \ } while (0) +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -234,6 +237,7 @@ public: mProfiler = std::make_shared(); mGemm1Profiles = mKernelRunner->getTactics(MoeGemmId::GEMM_1); mGemm2Profiles = mKernelRunner->getTactics(MoeGemmId::GEMM_2); + cuInit(0); } ~FusedMoeRunner() @@ -1193,12 +1197,14 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY(trtllm, m) { - m.class_("FusedMoeRunner") + m.class_("FusedMoeRunner") .def(torch::init()) - .def("run_gemm_profile", &torch_ext::FusedMoeRunner::runGemmProfile) - .def("get_tactic_num", &torch_ext::FusedMoeRunner::getTacticNum) - .def("run_moe", &torch_ext::FusedMoeRunner::runMoe) - .def("run_moe_min_latency", &torch_ext::FusedMoeRunner::runMoeMinLantency); + .def("run_gemm_profile", 
&tensorrt_llm::torch_ext::FusedMoeRunner::runGemmProfile) + .def("get_tactic_num", &tensorrt_llm::torch_ext::FusedMoeRunner::getTacticNum) + .def("run_moe", &tensorrt_llm::torch_ext::FusedMoeRunner::runMoe) + .def("run_moe_min_latency", &tensorrt_llm::torch_ext::FusedMoeRunner::runMoeMinLantency); } diff --git a/cpp/tensorrt_llm/thop/moeUtilOp.cpp b/cpp/tensorrt_llm/thop/moeUtilOp.cpp index cd1f327066..c11fe1703b 100644 --- a/cpp/tensorrt_llm/thop/moeUtilOp.cpp +++ b/cpp/tensorrt_llm/thop/moeUtilOp.cpp @@ -32,6 +32,8 @@ namespace common = tensorrt_llm::common; namespace kernels = tensorrt_llm::kernels; namespace cutlass_kernels = tensorrt_llm::kernels::cutlass_kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -329,6 +331,8 @@ torch::Tensor run_moe_finalize_scale_op(torch::Tensor const& gemm2_output, torch } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -347,6 +351,6 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("moe_permute_op", &torch_ext::moe_permute_op); - m.impl("moe_finalize_scale_op", &torch_ext::run_moe_finalize_scale_op); + m.impl("moe_permute_op", &tensorrt_llm::torch_ext::moe_permute_op); + m.impl("moe_finalize_scale_op", &tensorrt_llm::torch_ext::run_moe_finalize_scale_op); } diff --git a/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp index 2fdc8573cf..087871593e 100644 --- a/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/mxFp4BlockScaleMoe.cpp @@ -25,6 +25,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { namespace btg = batchedGemm::trtllm::gen; @@ -664,16 +666,18 @@ private: } // namespace torch_ext +TRTLLM_NAMESPACE_END + // Accepts CUDA tensor only TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("Bf16MxE2m1BlockScaleMoERunner") + m.class_("Bf16MxE2m1BlockScaleMoERunner") .def(torch::init()) - .def("get_valid_configs", 
&torch_ext::Bf16MxE2m1BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::Bf16MxE2m1BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::Bf16MxE2m1BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::Bf16MxE2m1BlockScaleMoeRunner::run); - m.class_("MxE4m3MxE2m1BlockScaleMoERunner") + m.class_("MxE4m3MxE2m1BlockScaleMoERunner") .def(torch::init()) - .def("get_valid_configs", &torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::getValidConfigs) - .def("run_moe", &torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::run); + .def("get_valid_configs", &tensorrt_llm::torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::getValidConfigs) + .def("run_moe", &tensorrt_llm::torch_ext::MxE4m3MxE2m1BlockScaleMoeRunner::run); } diff --git a/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp b/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp index ba651f2886..306e09e1c1 100644 --- a/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp +++ b/cpp/tensorrt_llm/thop/mxFp8Quantize.cpp @@ -24,6 +24,8 @@ #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { // self: [M, K], fp16/bf16/fp8_quantized @@ -102,6 +104,8 @@ std::tuple mxfp8_quantize( } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -111,5 +115,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mxfp8_quantize", &torch_ext::mxfp8_quantize); + m.impl("mxfp8_quantize", &tensorrt_llm::torch_ext::mxfp8_quantize); } diff --git a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp index 22a33e27b2..75ae96f36b 100644 --- a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp +++ b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.cpp @@ -20,6 +20,8 @@ namespace tr = tensorrt_llm::runtime; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -47,7 +49,10 @@ void NcclCommunicatorOp::recv(th::Tensor& tensor, int64_t fromRank) const } // namespace torch_ext -static auto trtllmNcclCommunicator = 
torch::jit::class_("trtllm", "NcclCommunicatorOp") - .def(torch::jit::init()) - .def("send", &torch_ext::NcclCommunicatorOp::send) - .def("recv", &torch_ext::NcclCommunicatorOp::recv); +TRTLLM_NAMESPACE_END + +static auto trtllmNcclCommunicator + = torch::jit::class_("trtllm", "NcclCommunicatorOp") + .def(torch::jit::init()) + .def("send", &tensorrt_llm::torch_ext::NcclCommunicatorOp::send) + .def("recv", &tensorrt_llm::torch_ext::NcclCommunicatorOp::recv); diff --git a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h old mode 100755 new mode 100644 index 4cf376c0ef..38f4d215ac --- a/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h +++ b/cpp/tensorrt_llm/thop/ncclCommunicatorOp.h @@ -15,12 +15,15 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/runtime/ncclCommunicator.h" #include "tensorrt_llm/thop/thUtils.h" #include namespace th = torch; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -38,3 +41,5 @@ private: }; } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/noAuxTcOp.cpp b/cpp/tensorrt_llm/thop/noAuxTcOp.cpp index 0804fb96b9..e445206e1d 100644 --- a/cpp/tensorrt_llm/thop/noAuxTcOp.cpp +++ b/cpp/tensorrt_llm/thop/noAuxTcOp.cpp @@ -32,6 +32,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { std::tuple noaux_tc_op(th::Tensor const& scores, th::Tensor const& bias, int64_t n_group, @@ -157,6 +159,8 @@ std::tuple noaux_tc_op(th::Tensor const& scores, th::Ten } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -166,5 +170,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("noaux_tc_op", &torch_ext::noaux_tc_op); + m.impl("noaux_tc_op", &tensorrt_llm::torch_ext::noaux_tc_op); } diff --git a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp 
b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp index 4c7b3d733a..400cf81033 100644 --- a/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp +++ b/cpp/tensorrt_llm/thop/parallelDecodeKVCacheUpdateOp.cpp @@ -23,6 +23,8 @@ namespace th = torch; namespace tksd = tensorrt_llm::kernels::speculative_decoding; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -113,5 +115,7 @@ void updateKVCacheDraftTokenLocation(torch::Tensor seqAcceptedDraftTokenOffsetsT } // namespace torch_ext +TRTLLM_NAMESPACE_END + static auto update_kv_cache_draft_token_location = torch::RegisterOperators( - "tensorrt_llm::update_kv_cache_draft_token_location", &torch_ext::updateKVCacheDraftTokenLocation); + "tensorrt_llm::update_kv_cache_draft_token_location", &tensorrt_llm::torch_ext::updateKVCacheDraftTokenLocation); diff --git a/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp b/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp index 7ff79e0c22..d72622b6c8 100644 --- a/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp +++ b/cpp/tensorrt_llm/thop/redrafterCurandOp.cpp @@ -35,6 +35,8 @@ namespace tr = tensorrt_llm::runtime; namespace tk = tensorrt_llm::kernels; namespace tksd = tensorrt_llm::kernels::speculative_decoding; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -163,5 +165,7 @@ void prepareRandomTensors(th::Tensor& curandState, // [maxBatchSize, 48], uint8_ } // namespace torch_ext -static auto redrafter_prepare_random_tensors - = torch::RegisterOperators("tensorrt_llm::redrafter_prepare_random_tensors", &torch_ext::prepareRandomTensors); +TRTLLM_NAMESPACE_END + +static auto redrafter_prepare_random_tensors = torch::RegisterOperators( + "tensorrt_llm::redrafter_prepare_random_tensors", &tensorrt_llm::torch_ext::prepareRandomTensors); diff --git a/cpp/tensorrt_llm/thop/reducescatterOp.cpp b/cpp/tensorrt_llm/thop/reducescatterOp.cpp index a8f1d93ee1..40f89e40ff 100644 --- a/cpp/tensorrt_llm/thop/reducescatterOp.cpp +++ b/cpp/tensorrt_llm/thop/reducescatterOp.cpp @@ -34,6 +34,8 @@ using 
tensorrt_llm::pg_utils::PgHelper; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { #if ENABLE_MULTI_DEVICE @@ -287,6 +289,8 @@ extern std::vector reducescatter_list_pg(torch::TensorList input_ } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("reducescatter(Tensor input, SymInt[]? sizes, int[] group) -> Tensor"); @@ -301,8 +305,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("reducescatter", &torch_ext::reducescatter); - m.impl("reducescatter_pg", &torch_ext::reducescatter_pg); - m.impl("reducescatter_list", &torch_ext::reducescatter_list); - m.impl("reducescatter_list_pg", &torch_ext::reducescatter_list_pg); + m.impl("reducescatter", &tensorrt_llm::torch_ext::reducescatter); + m.impl("reducescatter_pg", &tensorrt_llm::torch_ext::reducescatter_pg); + m.impl("reducescatter_list", &tensorrt_llm::torch_ext::reducescatter_list); + m.impl("reducescatter_list_pg", &tensorrt_llm::torch_ext::reducescatter_list_pg); } diff --git a/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp b/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp index b2b3f366a3..36306ac815 100644 --- a/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp +++ b/cpp/tensorrt_llm/thop/relativeAttentionBiasOp.cpp @@ -21,6 +21,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -68,5 +70,7 @@ void buildRelativeAttentionBias( } // namespace torch_ext -static auto relative_attention_bias - = torch::RegisterOperators("tensorrt_llm::relative_attention_bias", &torch_ext::buildRelativeAttentionBias); +TRTLLM_NAMESPACE_END + +static auto relative_attention_bias = torch::RegisterOperators( + "tensorrt_llm::relative_attention_bias", &tensorrt_llm::torch_ext::buildRelativeAttentionBias); diff --git a/cpp/tensorrt_llm/thop/selectiveScanOp.cpp b/cpp/tensorrt_llm/thop/selectiveScanOp.cpp index 46bcfda217..4414a3ce5d 100644 --- 
a/cpp/tensorrt_llm/thop/selectiveScanOp.cpp +++ b/cpp/tensorrt_llm/thop/selectiveScanOp.cpp @@ -21,6 +21,8 @@ namespace th = torch; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -244,6 +246,8 @@ std::tuple selective_scan(th::Tensor const& input, th::T } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -259,5 +263,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("selective_scan", &torch_ext::selective_scan); + m.impl("selective_scan", &tensorrt_llm::torch_ext::selective_scan); } diff --git a/cpp/tensorrt_llm/thop/specDecOp.cpp b/cpp/tensorrt_llm/thop/specDecOp.cpp index c68c08e29e..5f4111574e 100644 --- a/cpp/tensorrt_llm/thop/specDecOp.cpp +++ b/cpp/tensorrt_llm/thop/specDecOp.cpp @@ -15,6 +15,7 @@ * limitations under the License. */ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/opUtils.h" #include "tensorrt_llm/kernels/speculativeDecoding/draftTokenTreeKernels.h" @@ -25,6 +26,8 @@ namespace th = torch; namespace tl = tensorrt_llm; namespace tk = tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -336,6 +339,8 @@ void extract_real_draft_tokens_op(th::Tensor newDraftTokens, th::Tensor draftTok } // end namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def( @@ -348,7 +353,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_prepare_drafter_inputs_op", &torch_ext::mtp_prepare_drafter_inputs_op); + m.impl("mtp_prepare_drafter_inputs_op", &tensorrt_llm::torch_ext::mtp_prepare_drafter_inputs_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -363,7 +368,8 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_sampling_and_accepted_draft_tokens_op", 
&torch_ext::mtp_sampling_and_accepted_draft_tokens_op); + m.impl("mtp_sampling_and_accepted_draft_tokens_op", + &tensorrt_llm::torch_ext::mtp_sampling_and_accepted_draft_tokens_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -378,7 +384,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_update_hidden_states_op", &torch_ext::mtp_update_hidden_states_op); + m.impl("mtp_update_hidden_states_op", &tensorrt_llm::torch_ext::mtp_update_hidden_states_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -394,7 +400,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("mtp_relaxed_acceptance_op", &torch_ext::mtp_relaxed_acceptance_op); + m.impl("mtp_relaxed_acceptance_op", &tensorrt_llm::torch_ext::mtp_relaxed_acceptance_op); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -409,5 +415,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("extract_real_draft_tokens_op", &torch_ext::extract_real_draft_tokens_op); + m.impl("extract_real_draft_tokens_op", &tensorrt_llm::torch_ext::extract_real_draft_tokens_op); } diff --git a/cpp/tensorrt_llm/thop/thUtils.cpp b/cpp/tensorrt_llm/thop/thUtils.cpp index 5c81856999..97fe6acaab 100644 --- a/cpp/tensorrt_llm/thop/thUtils.cpp +++ b/cpp/tensorrt_llm/thop/thUtils.cpp @@ -18,6 +18,8 @@ #include #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -111,3 +113,5 @@ cudaDataType_t convert_torch_dtype(torch::ScalarType dtype) } } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/thUtils.h b/cpp/tensorrt_llm/thop/thUtils.h index 3ca6701ee2..04ec60e007 100644 --- a/cpp/tensorrt_llm/thop/thUtils.h +++ b/cpp/tensorrt_llm/thop/thUtils.h @@ -16,6 +16,7 @@ #pragma once +#include "tensorrt_llm/common/config.h" 
#include "tensorrt_llm/runtime/iTensor.h" #include @@ -54,6 +55,8 @@ #define PRINT_TENSOR(x) std::cout << #x << ":\n" << x << std::endl #define PRINT_TENSOR_SIZE(x) std::cout << "size of " << #x << ": " << x.sizes() << std::endl +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -93,3 +96,5 @@ std::optional getFloatEnv(char const* name); cudaDataType_t convert_torch_dtype(torch::ScalarType dtype); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/tinygemm2.cpp b/cpp/tensorrt_llm/thop/tinygemm2.cpp index 3be0bea04b..b617a65812 100644 --- a/cpp/tensorrt_llm/thop/tinygemm2.cpp +++ b/cpp/tensorrt_llm/thop/tinygemm2.cpp @@ -26,6 +26,8 @@ torch::Tensor tinygemm2_cuda_forward(torch::Tensor input, torch::Tensor weight, torch::Tensor bias); // C++ interface +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { torch::Tensor tinygemm2_forward(torch::Tensor input, torch::Tensor weight, torch::Tensor bias) @@ -45,6 +47,8 @@ torch::Tensor tinygemm2_forward(torch::Tensor input, torch::Tensor weight, torch } } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("tinygemm2(Tensor input, Tensor weight, Tensor bias) -> Tensor"); @@ -52,5 +56,5 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("tinygemm2", &torch_ext::tinygemm2_forward); + m.impl("tinygemm2", &tensorrt_llm::torch_ext::tinygemm2_forward); } diff --git a/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp b/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp index f29ea57e71..3857259b2b 100644 --- a/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp +++ b/cpp/tensorrt_llm/thop/userbuffersFinalizeOp.cpp @@ -34,7 +34,7 @@ torch::Tensor userbuffers_allreduce_finalize(torch::Tensor input, bool force_app int hidden_size = input.size(-1); auto& ub_manager = tensorrt_llm::runtime::ub::UserBuffersManager::get_instance(); - auto [output, ub_buffer] = torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); + auto [output, 
ub_buffer] = tensorrt_llm::torch_ext::create_userbuffers_tensor(input.sizes(), input.scalar_type()); auto const dtype = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type()); diff --git a/cpp/tensorrt_llm/thop/userbuffersTensor.cpp b/cpp/tensorrt_llm/thop/userbuffersTensor.cpp index 4318f38bcd..47c1ea6998 100644 --- a/cpp/tensorrt_llm/thop/userbuffersTensor.cpp +++ b/cpp/tensorrt_llm/thop/userbuffersTensor.cpp @@ -15,6 +15,8 @@ */ #include "userbuffersTensor.h" +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -46,7 +48,9 @@ torch::Tensor create_userbuffers_tensor_op(at::IntArrayRef shape, torch::ScalarT } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.def("create_userbuffers_tensor", &torch_ext::create_userbuffers_tensor_op); + m.def("create_userbuffers_tensor", &tensorrt_llm::torch_ext::create_userbuffers_tensor_op); } diff --git a/cpp/tensorrt_llm/thop/userbuffersTensor.h b/cpp/tensorrt_llm/thop/userbuffersTensor.h index 86c634c7ff..861c3e6620 100644 --- a/cpp/tensorrt_llm/thop/userbuffersTensor.h +++ b/cpp/tensorrt_llm/thop/userbuffersTensor.h @@ -15,9 +15,12 @@ */ #pragma once +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h" #include +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { @@ -25,3 +28,5 @@ std::pair create_userbuffers at::IntArrayRef shape, torch::ScalarType dtype); } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp index a00b51e16e..b8cfac19a8 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.cpp @@ -15,6 +15,7 @@ */ #include "weightOnlyQuantGemm.h" #include "cutlass/numeric_types.h" +#include "tensorrt_llm/common/config.h" #include #include @@ -22,6 +23,8 @@ using namespace tensorrt_llm::kernels::cutlass_kernels; using namespace tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + 
namespace torch_ext { @@ -156,10 +159,12 @@ int64_t WeightOnlyQuantGemmRunner::getNumConfigs() const } // namespace torch_ext +TRTLLM_NAMESPACE_END + TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.class_("WeightOnlyQuantGemmRunner") + m.class_("WeightOnlyQuantGemmRunner") .def(torch::init()) - .def("run_gemm", &torch_ext::WeightOnlyQuantGemmRunner::runGemm) - .def("get_num_configs", &torch_ext::WeightOnlyQuantGemmRunner::getNumConfigs); + .def("run_gemm", &tensorrt_llm::torch_ext::WeightOnlyQuantGemmRunner::runGemm) + .def("get_num_configs", &tensorrt_llm::torch_ext::WeightOnlyQuantGemmRunner::getNumConfigs); } diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h index df062d79a5..0b08b51b36 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantGemm.h @@ -18,6 +18,7 @@ #include "cutlass_extensions/gemm_configs.h" #include "cutlass_extensions/weight_only_quant_op.h" +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" @@ -29,6 +30,8 @@ using namespace tensorrt_llm::kernels::cutlass_kernels; using namespace tensorrt_llm::kernels; +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using WeightOnlyQuantGemmRunnerPtr = std::shared_ptr; @@ -51,3 +54,5 @@ private: }; } // namespace torch_ext + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp index b6feba15e6..89c3312b9b 100644 --- a/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp +++ b/cpp/tensorrt_llm/thop/weightOnlyQuantOp.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaBf16Wrapper.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.h" #include "tensorrt_llm/thop/thUtils.h" @@ -23,6 +24,8 @@ #define TORCH_IS_AT_LEAST_v190 #endif +TRTLLM_NAMESPACE_BEGIN + namespace torch_ext { using torch::Tensor; @@ -400,35 +403,38 @@ Tensor mxfp4_dequantize_unswizzled(Tensor weight, Tensor scale, int64_t group_si } // namespace torch_ext +TRTLLM_NAMESPACE_END + // Utility methods that may be useful for preprocessing weights in torch. static auto symmetric_quantize_last_axis_of_batched_matrix = torch::RegisterOperators("trtllm::symmetric_quantize_last_axis_of_batched_matrix", - &torch_ext::symmetric_quantize_last_axis_of_batched_matrix); + &tensorrt_llm::torch_ext::symmetric_quantize_last_axis_of_batched_matrix); static auto preprocess_weights_for_mixed_gemm = torch::RegisterOperators( - "trtllm::preprocess_weights_for_mixed_gemm", &torch_ext::preprocess_weights_for_mixed_gemm); + "trtllm::preprocess_weights_for_mixed_gemm", &tensorrt_llm::torch_ext::preprocess_weights_for_mixed_gemm); static auto unpack_int4_packed_tensor_to_int8 = torch::RegisterOperators( - "trtllm::unpack_int4_packed_tensor_to_int8", &torch_ext::unpack_int4_packed_tensor_to_int8); + "trtllm::unpack_int4_packed_tensor_to_int8", &tensorrt_llm::torch_ext::unpack_int4_packed_tensor_to_int8); -static auto pack_int8_tensor_to_packed_int4 - = torch::RegisterOperators("trtllm::pack_int8_tensor_to_packed_int4", &torch_ext::pack_int8_tensor_to_packed_int4); +static auto pack_int8_tensor_to_packed_int4 = torch::RegisterOperators( + "trtllm::pack_int8_tensor_to_packed_int4", &tensorrt_llm::torch_ext::pack_int8_tensor_to_packed_int4); // Utility methods exposed purely for unit tests in torch. 
static auto _symmetric_quantize_last_axis_of_batched_matrix = torch::RegisterOperators("trtllm::_symmetric_quantize_last_axis_of_batched_matrix", - &torch_ext::_symmetric_quantize_last_axis_of_batched_matrix); + &tensorrt_llm::torch_ext::_symmetric_quantize_last_axis_of_batched_matrix); -static auto add_bias_and_interleave_int4s - = torch::RegisterOperators("trtllm::_add_bias_and_interleave_int4s", &torch_ext::add_bias_and_interleave_int4s); +static auto add_bias_and_interleave_int4s = torch::RegisterOperators( + "trtllm::_add_bias_and_interleave_int4s", &tensorrt_llm::torch_ext::add_bias_and_interleave_int4s); -static auto add_bias_and_interleave_int8s - = torch::RegisterOperators("trtllm::_add_bias_and_interleave_int8s", &torch_ext::add_bias_and_interleave_int8s); +static auto add_bias_and_interleave_int8s = torch::RegisterOperators( + "trtllm::_add_bias_and_interleave_int8s", &tensorrt_llm::torch_ext::add_bias_and_interleave_int8s); -static auto permute_B_rows_for_mixed_gemm - = torch::RegisterOperators("trtllm::_permute_B_rows_for_mixed_gemm", &torch_ext::permute_B_rows_for_mixed_gemm); +static auto permute_B_rows_for_mixed_gemm = torch::RegisterOperators( + "trtllm::_permute_B_rows_for_mixed_gemm", &tensorrt_llm::torch_ext::permute_B_rows_for_mixed_gemm); -static auto subbyte_transpose = torch::RegisterOperators("trtllm::_subbyte_transpose", &torch_ext::subbyte_transpose); +static auto subbyte_transpose + = torch::RegisterOperators("trtllm::_subbyte_transpose", &tensorrt_llm::torch_ext::subbyte_transpose); -static auto mxfp4_dequantize_unswizzled - = torch::RegisterOperators("trtllm::mxfp4_dequantize_unswizzled", &torch_ext::mxfp4_dequantize_unswizzled); +static auto mxfp4_dequantize_unswizzled = torch::RegisterOperators( + "trtllm::mxfp4_dequantize_unswizzled", &tensorrt_llm::torch_ext::mxfp4_dequantize_unswizzled); diff --git a/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp index 303ed40117..221cd98b5f 
100644 --- a/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/mpiUtilsTest.cpp @@ -178,6 +178,7 @@ void testSendMRecv() } else if (rank == 1) { +#if ENABLE_MULTI_DEVICE MPI_Message msg; MPI_Status status; comm.mprobe(0, tag, &msg, &status); @@ -190,6 +191,7 @@ void testSendMRecv() MPICHECK( MPI_Mrecv(&value, count, getMpiDtype(mpi::MpiTypeConverter>::value), &msg, &status)); EXPECT_EQ(value, expectedValue); +#endif // ENABLE_MULTI_DEVICE } } diff --git a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp index bf4ddd2141..88533ce7ca 100644 --- a/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/ncclUtilsTest.cpp @@ -36,7 +36,7 @@ namespace mpi = tensorrt_llm::mpi; namespace tr = tensorrt_llm::runtime; namespace nccl_util = tensorrt_llm::common::nccl_util; -using ::getComm; +using tensorrt_llm::getComm; // Helper function to create a split communicator for testing // This allows us to test cleanup behavior explicitly by controlling the lifetime diff --git a/cpp/tests/unit_tests/thop/thUtilsTest.cpp b/cpp/tests/unit_tests/thop/thUtilsTest.cpp index 262609cad8..06bf41b8fb 100644 --- a/cpp/tests/unit_tests/thop/thUtilsTest.cpp +++ b/cpp/tests/unit_tests/thop/thUtilsTest.cpp @@ -19,7 +19,7 @@ #include "tensorrt_llm/thop/thUtils.h" #include -using namespace torch_ext; +using namespace tensorrt_llm::torch_ext; TEST(ThUtils, ConvertShape2D) { diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index bd836df4f5..03aae58617 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -290,6 +290,11 @@ def generate_fmha_cu(project_dir, venv_python): move_if_updated(fmha_v2_dir / "generated/fmha_cubin.h", cubin_dir / "fmha_cubin.h") + # Copy generated source file (fmha_cubin.cpp) to the same directory as header + cpp_src = fmha_v2_dir / "generated/fmha_cubin.cpp" + if cpp_src.exists(): + move_if_updated(cpp_src, cubin_dir / "fmha_cubin.cpp") 
+ generated_files = set() for cu_file in (fmha_v2_dir / "generated").glob("*sm*.cu"): dst_file = fmha_v2_cu_dir / os.path.basename(cu_file) diff --git a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py index 3a611a640c..348e665475 100644 --- a/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cpp_custom_ops.py @@ -5,7 +5,10 @@ import torch import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils from ..._utils import get_sm_version -from .cute_dsl_custom_ops import GroupedGemmInputsHelper +from ..cute_dsl_utils import IS_CUTLASS_DSL_AVAILABLE + +if IS_CUTLASS_DSL_AVAILABLE: + from .cute_dsl_custom_ops import GroupedGemmInputsHelper def _register_fake(): @@ -486,104 +489,106 @@ def _register_fake(): return gemm2_output.new_empty((num_rows_val, unpadded_hidden_size_val), dtype=gemm2_output.dtype) - @torch.library.register_fake("trtllm::moe_topk_sort") - def _( - routing_logits: torch.Tensor, - routing_bias: Optional[torch.Tensor], - num_experts: int, - top_k: int, - n_group: Optional[int], - topk_group: Optional[int], - local_expert_offset: int, - local_num_experts: int, - routed_scaling_factor: Optional[float], - tile_tokens_dim: int, - routing_method_type: int, - ) -> List[torch.Tensor]: - helper = GroupedGemmInputsHelper( - num_experts=num_experts, - top_k=top_k, - num_local_experts=local_num_experts, - local_expert_offset=local_expert_offset, - tile_size=tile_tokens_dim, - ) - num_tokens = routing_logits.size(0) - device = routing_logits.device - routing_bias_dtype = torch.bfloat16 if routing_bias is None else routing_bias.dtype - max_num_tiles = helper.get_max_num_tiles(num_tokens) - max_num_permuted_tokens = helper.get_max_num_permuted_tokens(num_tokens) - tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - expanded_idx_to_permuted_idx 
= torch.empty((num_tokens, top_k), - dtype=torch.int32, - device=device) - permuted_idx_to_expanded_idx = torch.empty((max_num_permuted_tokens, ), - dtype=torch.int32, - device=device) - total_num_padded_tokens = torch.empty((1, ), - dtype=torch.int32, - device=device) - num_non_exiting_tiles = torch.empty((1, ), - dtype=torch.int32, - device=device) - new_token_final_scales = torch.empty((num_tokens, top_k), - dtype=routing_bias_dtype, - device=device) - return [ - tile_idx_to_expert_idx, tile_idx_to_mn_limit, - expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, - total_num_padded_tokens, num_non_exiting_tiles, - new_token_final_scales - ] + if IS_CUTLASS_DSL_AVAILABLE: - @torch.library.register_fake("trtllm::moe_sort") - def _( - token_selected_experts: torch.Tensor, - token_final_scales: torch.Tensor, - num_experts: int, - top_k: int, - local_expert_offset: int, - local_num_experts: int, - tile_tokens_dim: int, - ) -> List[torch.Tensor]: - helper = GroupedGemmInputsHelper( - num_experts=num_experts, - top_k=top_k, - num_local_experts=local_num_experts, - local_expert_offset=local_expert_offset, - tile_size=tile_tokens_dim, - ) - num_tokens = token_selected_experts.size(0) - device = token_selected_experts.device - max_num_tiles = helper.get_max_num_tiles(num_tokens) - max_num_permuted_tokens = helper.get_max_num_permuted_tokens(num_tokens) - tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), - dtype=torch.int32, - device=device) - expanded_idx_to_permuted_idx = torch.empty((num_tokens, top_k), - dtype=torch.int32, - device=device) - permuted_idx_to_expanded_idx = torch.empty((max_num_permuted_tokens, ), - dtype=torch.int32, - device=device) - total_num_padded_tokens = torch.empty((1, ), - dtype=torch.int32, - device=device) - num_non_exiting_tiles = torch.empty((1, ), - dtype=torch.int32, - device=device) - return [ - tile_idx_to_expert_idx, 
tile_idx_to_mn_limit, - expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, - total_num_padded_tokens, num_non_exiting_tiles - ] + @torch.library.register_fake("trtllm::moe_topk_sort") + def _( + routing_logits: torch.Tensor, + routing_bias: Optional[torch.Tensor], + num_experts: int, + top_k: int, + n_group: Optional[int], + topk_group: Optional[int], + local_expert_offset: int, + local_num_experts: int, + routed_scaling_factor: Optional[float], + tile_tokens_dim: int, + routing_method_type: int, + ) -> List[torch.Tensor]: + helper = GroupedGemmInputsHelper( + num_experts=num_experts, + top_k=top_k, + num_local_experts=local_num_experts, + local_expert_offset=local_expert_offset, + tile_size=tile_tokens_dim, + ) + num_tokens = routing_logits.size(0) + device = routing_logits.device + routing_bias_dtype = torch.bfloat16 if routing_bias is None else routing_bias.dtype + max_num_tiles = helper.get_max_num_tiles(num_tokens) + max_num_permuted_tokens = helper.get_max_num_permuted_tokens( + num_tokens) + tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + expanded_idx_to_permuted_idx = torch.empty((num_tokens, top_k), + dtype=torch.int32, + device=device) + permuted_idx_to_expanded_idx = torch.empty( + (max_num_permuted_tokens, ), dtype=torch.int32, device=device) + total_num_padded_tokens = torch.empty((1, ), + dtype=torch.int32, + device=device) + num_non_exiting_tiles = torch.empty((1, ), + dtype=torch.int32, + device=device) + new_token_final_scales = torch.empty((num_tokens, top_k), + dtype=routing_bias_dtype, + device=device) + return [ + tile_idx_to_expert_idx, tile_idx_to_mn_limit, + expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, + total_num_padded_tokens, num_non_exiting_tiles, + new_token_final_scales + ] + + @torch.library.register_fake("trtllm::moe_sort") + def _( + token_selected_experts: 
torch.Tensor, + token_final_scales: torch.Tensor, + num_experts: int, + top_k: int, + local_expert_offset: int, + local_num_experts: int, + tile_tokens_dim: int, + ) -> List[torch.Tensor]: + helper = GroupedGemmInputsHelper( + num_experts=num_experts, + top_k=top_k, + num_local_experts=local_num_experts, + local_expert_offset=local_expert_offset, + tile_size=tile_tokens_dim, + ) + num_tokens = token_selected_experts.size(0) + device = token_selected_experts.device + max_num_tiles = helper.get_max_num_tiles(num_tokens) + max_num_permuted_tokens = helper.get_max_num_permuted_tokens( + num_tokens) + tile_idx_to_expert_idx = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + tile_idx_to_mn_limit = torch.empty((max_num_tiles, ), + dtype=torch.int32, + device=device) + expanded_idx_to_permuted_idx = torch.empty((num_tokens, top_k), + dtype=torch.int32, + device=device) + permuted_idx_to_expanded_idx = torch.empty( + (max_num_permuted_tokens, ), dtype=torch.int32, device=device) + total_num_padded_tokens = torch.empty((1, ), + dtype=torch.int32, + device=device) + num_non_exiting_tiles = torch.empty((1, ), + dtype=torch.int32, + device=device) + return [ + tile_idx_to_expert_idx, tile_idx_to_mn_limit, + expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, + total_num_padded_tokens, num_non_exiting_tiles + ] @torch.library.register_fake("trtllm::moe_permute") def _( diff --git a/tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py b/tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py new file mode 100644 index 0000000000..54cf23d6cb --- /dev/null +++ b/tests/unittest/_torch/attention/test_trtllm_flashinfer_symbol_collision.py @@ -0,0 +1,83 @@ +"""Unit tests for FlashInfer fused MOE custom op.""" + +import flashinfer.fused_moe +import pytest +import torch + +import tensorrt_llm._torch.auto_deploy.custom_ops.fused_moe.torch_moe # noqa: F401 +import tensorrt_llm._torch.custom_ops.torch_custom_ops as 
trt_ops # noqa: F401 + + +def test_flashinfer_fused_moe_matches_torch_moe(): + """Test that flashinfer_fused_moe matches torch_moe reference.""" + torch.manual_seed(0) + + if not torch.cuda.is_available(): + pytest.skip("CUDA is required for flashinfer_fused_moe test") + + device = "cuda" + dtype = torch.bfloat16 + + # Small test case + M = 8 # tokens + HIDDEN_SIZE = 64 + INTERMEDIATE_SIZE = 128 + E = 4 # experts + top_k = 2 + + # Input + x = torch.randn(M, HIDDEN_SIZE, device=device, dtype=dtype) + + # Expert weights for gated MLP (SwiGLU) + # w1 = gate projection, w3 = up projection, w2 = down projection + w1_list = [ + torch.randn(INTERMEDIATE_SIZE, HIDDEN_SIZE, device=device, dtype=dtype) for _ in range(E) + ] + w2_list = [ + torch.randn(HIDDEN_SIZE, INTERMEDIATE_SIZE, device=device, dtype=dtype) for _ in range(E) + ] + w3_list = [ + torch.randn(INTERMEDIATE_SIZE, HIDDEN_SIZE, device=device, dtype=dtype) for _ in range(E) + ] + + # FlashInfer expects fc1 (gate + up concatenated) and fc2 (down) + # fc1_expert_weights: [E, 2*INTERMEDIATE_SIZE, HIDDEN_SIZE] + w1_w3_stacked = torch.stack( + [torch.cat([w3, w1], dim=0) for w1, w3 in zip(w1_list, w3_list)], dim=0 + ).contiguous() + + # fc2_expert_weights: [E, HIDDEN_SIZE, INTERMEDIATE_SIZE] + w2_stacked = torch.stack(w2_list, dim=0).contiguous() + + # Random routing with top-k normalization + router_logits = torch.randn(M, E, device=device, dtype=torch.float32) + routing_full = torch.softmax(router_logits, dim=-1) + routing_weights, selected_experts = torch.topk(routing_full, k=top_k, dim=-1) + routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(torch.float32) + + # FlashInfer fused MOE - call directly + out_flashinfer = flashinfer.fused_moe.cutlass_fused_moe( + input=x, + token_selected_experts=selected_experts.to(torch.int32), + token_final_scales=routing_weights, + fc1_expert_weights=w1_w3_stacked, + fc2_expert_weights=w2_stacked, + 
output_dtype=dtype, + quant_scales=[], + ) + + # Reference Torch MoE (gated_mlp with SwiGLU) + out_torch = torch.ops.auto_deploy.torch_moe( + x, + selected_experts, + routing_weights, + w1_weight=w1_list, # gate projection + w2_weight=w2_list, # down projection + w3_weight=w3_list, # up projection + mlp_style="gated_mlp", + act_fn="silu", + ) + + # Compare outputs + torch.testing.assert_close(out_flashinfer[0], out_torch, rtol=5e-1, atol=5e-1)